diff --git a/.gitignore b/.gitignore
index b641685e34f2d11700d9ad75ceaf21fa65f2f665..10a199b667ef32d75f52423f819da4dbb9058df8 100644
Binary files a/.gitignore and b/.gitignore differ
diff --git a/Dockerfile b/Dockerfile
index 165497eb893ca8aeae6f6f28191e906d973fa278..4aaa8cb4a808c4490e62cc3e95d0fc5a91c8e48c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,15 +1,33 @@
-FROM python:3.10-slim
+FROM python:3.13-slim
-# System dependencies for bitsandbytes
+# Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     && rm -rf /var/lib/apt/lists/*
-WORKDIR /app
+# Hugging Face Spaces requires running as a non-root user (UID 1000)
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
-COPY requirements.txt .
+WORKDIR $HOME/app
+
+# Copy requirements
+COPY --chown=user requirements.txt .
+
+# Install PyTorch with CUDA 12.6 (cu126) wheels, matching our local battle-tested environment
+RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 --upgrade
+
+# Install remaining requirements and Triton explicitly for the Linux cloud environment
 RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir triton
+
+# Copy application files
+COPY --chown=user . .
-COPY . .
+# Expose the mandatory port required by Hugging Face Spaces
+EXPOSE 7860
-CMD ["bash"]
+# Launch the core FastAPI application on the expected HF Spaces port
+CMD ["python", "-m", "uvicorn", "src.api.server:app", "--host", "0.0.0.0", "--port", "7860"]
diff --git a/Include/Python.h b/Include/Python.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f75de6cb42f42948307c93f2392cb3c74b5a2da
--- /dev/null
+++ b/Include/Python.h
@@ -0,0 +1,140 @@
+// Entry point of the Python C API.
+// C extensions should only #include <Python.h>, and not include directly
+// the other Python header files included by <Python.h>.
+
+#ifndef Py_PYTHON_H
+#define Py_PYTHON_H
+
+// Since this is a "meta-include" file, "#ifdef __cplusplus / extern "C" {"
+// is not needed.
+
+
+// Include Python header files
+#include "patchlevel.h"
+#include "pyconfig.h"
+#include "pymacconfig.h"
+
+
+// Include standard header files
+// When changing these files, remember to update Doc/extending/extending.rst.
+#include <assert.h>               // assert()
+#include <stdint.h>               // uintptr_t
+#include <limits.h>               // INT_MAX
+#include <math.h>                 // HUGE_VAL
+#include <stdarg.h>               // va_list
+#include <wchar.h>                // wchar_t
+#ifdef HAVE_SYS_TYPES_H
+#  include <sys/types.h>          // ssize_t
+#endif
+
+// The <errno.h>, <stdio.h>, <stdlib.h> and <string.h> headers are no longer used
+// by Python, but kept for the backward compatibility of existing third party C
+// extensions. They are not included by limited C API version 3.11 and newer.
+//
+// The <ctype.h> and <unistd.h> headers are not included by limited C API
+// version 3.13 and newer.
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
+#  include <errno.h>              // errno
+#  include <stdio.h>              // FILE*
+#  include <stdlib.h>             // getenv()
+#  include <string.h>             // memcpy()
+#endif
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030d0000
+#  include <ctype.h>              // tolower()
+#  ifndef MS_WINDOWS
+#    include <unistd.h>           // close()
+#  endif
+#endif
+
+// gh-111506: The free-threaded build is not compatible with the limited API
+// or the stable ABI.
+#if defined(Py_LIMITED_API) && defined(Py_GIL_DISABLED)
+#  error "The limited API is not currently supported in the free-threaded build"
+#endif
+
+#if defined(Py_GIL_DISABLED) && defined(_MSC_VER)
+#  include <intrin.h>             // __readgsqword()
+#endif
+
+#if defined(Py_GIL_DISABLED) && defined(__MINGW32__)
+#  include <intrin.h>             // __readgsqword()
+#endif
+
+// Include Python header files
+#include "pyport.h"
+#include "pymacro.h"
+#include "pymath.h"
+#include "pymem.h"
+#include "pytypedefs.h"
+#include "pybuffer.h"
+#include "pystats.h"
+#include "pyatomic.h"
+#include "lock.h"
+#include "object.h"
+#include "objimpl.h"
+#include "typeslots.h"
+#include "pyhash.h"
+#include "cpython/pydebug.h"
+#include "bytearrayobject.h"
+#include "bytesobject.h"
+#include "unicodeobject.h"
+#include "pyerrors.h"
+#include "longobject.h"
+#include "cpython/longintrepr.h"
+#include "boolobject.h"
+#include "floatobject.h"
+#include "complexobject.h"
+#include "rangeobject.h"
+#include "memoryobject.h"
+#include "tupleobject.h"
+#include "listobject.h"
+#include "dictobject.h"
+#include "cpython/odictobject.h"
+#include "enumobject.h"
+#include "setobject.h"
+#include "methodobject.h"
+#include "moduleobject.h"
+#include "monitoring.h"
+#include "cpython/funcobject.h"
+#include "cpython/classobject.h"
+#include "fileobject.h"
+#include "pycapsule.h"
+#include "cpython/code.h"
+#include "pyframe.h"
+#include "traceback.h"
+#include "sliceobject.h"
+#include "cpython/cellobject.h"
+#include "iterobject.h"
+#include "cpython/initconfig.h"
+#include "pystate.h"
+#include "cpython/genobject.h"
+#include "descrobject.h"
+#include "genericaliasobject.h"
+#include "warnings.h"
+#include "weakrefobject.h"
+#include "structseq.h"
+#include "cpython/picklebufobject.h"
+#include "cpython/pytime.h"
+#include "codecs.h"
+#include "pythread.h"
+#include "cpython/context.h"
+#include "modsupport.h"
+#include "compile.h"
+#include "pythonrun.h"
+#include "pylifecycle.h"
+#include "ceval.h"
+#include "sysmodule.h"
+#include "osmodule.h"
+#include "intrcheck.h"
+#include "import.h"
+#include "abstract.h"
+#include "bltinmodule.h"
+#include "critical_section.h"
+#include "cpython/pyctype.h"
+#include "pystrtod.h"
+#include "pystrcmp.h"
+#include "fileutils.h"
+#include "cpython/pyfpe.h"
+#include "cpython/tracemalloc.h"
+
+#endif /* !Py_PYTHON_H */
diff --git a/Include/abstract.h b/Include/abstract.h
new file mode 100644
index 0000000000000000000000000000000000000000..98e1bbe4100fc75ddc191a1045b782046ab247e6
--- /dev/null
+++ b/Include/abstract.h
@@ -0,0 +1,921 @@
+/* Abstract Object Interface (many thanks to Jim Fulton) */
+
+#ifndef Py_ABSTRACTOBJECT_H
+#define Py_ABSTRACTOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* === Object Protocol ================================================== */
+
+/* Implemented elsewhere:
+
+   int PyObject_Print(PyObject *o, FILE *fp, int flags);
+
+   Print an object 'o' on file 'fp'. Returns -1 on error. The flags argument
+   is used to enable certain printing options. The only option currently
+   supported is Py_PRINT_RAW. By default (flags=0), PyObject_Print() formats
+   the object by calling PyObject_Repr(). If flags equals Py_PRINT_RAW, it
+   formats the object by calling PyObject_Str(). */
+
+
+/* Implemented elsewhere:
+
+   int PyObject_HasAttrString(PyObject *o, const char *attr_name);
+
+   Returns 1 if object 'o' has the attribute attr_name, and 0 otherwise.
+
+   This is equivalent to the Python expression: hasattr(o,attr_name).
+
+   This function always succeeds.
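+
+   An illustrative sketch only (editor's example, not upstream text; "obj"
+   and the attribute name are hypothetical):
+
+       if (PyObject_HasAttrString(obj, "close")) {
+           PyObject *res = PyObject_CallMethod(obj, "close", NULL);
+           Py_XDECREF(res);
+       }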
*/
+
+
+/* Implemented elsewhere:
+
+   PyObject* PyObject_GetAttrString(PyObject *o, const char *attr_name);
+
+   Retrieve an attribute named attr_name from object o.
+   Returns the attribute value on success, or NULL on failure.
+
+   This is the equivalent of the Python expression: o.attr_name. */
+
+
+/* Implemented elsewhere:
+
+   int PyObject_HasAttr(PyObject *o, PyObject *attr_name);
+
+   Returns 1 if o has the attribute attr_name, and 0 otherwise.
+
+   This is equivalent to the Python expression: hasattr(o,attr_name).
+
+   This function always succeeds. */
+
+
+/* Implemented elsewhere:
+
+   int PyObject_HasAttrStringWithError(PyObject *o, const char *attr_name);
+
+   Returns 1 if object 'o' has the attribute attr_name, and 0 otherwise.
+   This is equivalent to the Python expression: hasattr(o,attr_name).
+   Returns -1 on failure. */
+
+
+/* Implemented elsewhere:
+
+   int PyObject_HasAttrWithError(PyObject *o, PyObject *attr_name);
+
+   Returns 1 if o has the attribute attr_name, and 0 otherwise.
+   This is equivalent to the Python expression: hasattr(o,attr_name).
+   Returns -1 on failure. */
+
+
+/* Implemented elsewhere:
+
+   PyObject* PyObject_GetAttr(PyObject *o, PyObject *attr_name);
+
+   Retrieve an attribute named 'attr_name' from object 'o'.
+   Returns the attribute value on success, or NULL on failure.
+
+   This is the equivalent of the Python expression: o.attr_name. */
+
+
+/* Implemented elsewhere:
+
+   int PyObject_GetOptionalAttr(PyObject *obj, PyObject *attr_name, PyObject **result);
+
+   Variant of PyObject_GetAttr() which doesn't raise AttributeError
+   if the attribute is not found.
+
+   If the attribute is found, return 1 and set *result to a new strong
+   reference to the attribute.
+   If the attribute is not found, return 0 and set *result to NULL;
+   the AttributeError is silenced.
+   If an error other than AttributeError is raised, return -1 and
+   set *result to NULL.
+*/
+
+
+/* Implemented elsewhere:
+
+   int PyObject_GetOptionalAttrString(PyObject *obj, const char *attr_name, PyObject **result);
+
+   Variant of PyObject_GetAttrString() which doesn't raise AttributeError
+   if the attribute is not found.
+
+   If the attribute is found, return 1 and set *result to a new strong
+   reference to the attribute.
+   If the attribute is not found, return 0 and set *result to NULL;
+   the AttributeError is silenced.
+   If an error other than AttributeError is raised, return -1 and
+   set *result to NULL.
+*/
+
+
+/* Implemented elsewhere:
+
+   int PyObject_SetAttrString(PyObject *o, const char *attr_name, PyObject *v);
+
+   Set the value of the attribute named attr_name, for object 'o',
+   to the value 'v'. Raise an exception and return -1 on failure; return 0 on
+   success.
+
+   This is the equivalent of the Python statement o.attr_name=v. */
+
+
+/* Implemented elsewhere:
+
+   int PyObject_SetAttr(PyObject *o, PyObject *attr_name, PyObject *v);
+
+   Set the value of the attribute named attr_name, for object 'o', to the value
+   'v'. Raise an exception and return -1 on failure; return 0 on success.
+
+   This is the equivalent of the Python statement o.attr_name=v. */
+
+/* Implemented elsewhere:
+
+   int PyObject_DelAttrString(PyObject *o, const char *attr_name);
+
+   Delete attribute named attr_name, for object o. Returns
+   -1 on failure.
+
+   This is the equivalent of the Python statement: del o.attr_name.
+
+   Implemented as a macro in the limited C API 3.12 and older.
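+
+   An illustrative set/delete round trip (editor's sketch; "obj" and the
+   attribute name are hypothetical):
+
+       if (PyObject_SetAttrString(obj, "cache", Py_None) < 0) {
+           return NULL;   // exception already set
+       }
+       if (PyObject_DelAttrString(obj, "cache") < 0) {
+           return NULL;
+       }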
*/ +#if defined(Py_LIMITED_API) && Py_LIMITED_API+0 < 0x030d0000 +# define PyObject_DelAttrString(O, A) PyObject_SetAttrString((O), (A), NULL) +#endif + + +/* Implemented elsewhere: + + int PyObject_DelAttr(PyObject *o, PyObject *attr_name); + + Delete attribute named attr_name, for object o. Returns -1 + on failure. This is the equivalent of the Python + statement: del o.attr_name. + + Implemented as a macro in the limited C API 3.12 and older. */ +#if defined(Py_LIMITED_API) && Py_LIMITED_API+0 < 0x030d0000 +# define PyObject_DelAttr(O, A) PyObject_SetAttr((O), (A), NULL) +#endif + + +/* Implemented elsewhere: + + PyObject *PyObject_Repr(PyObject *o); + + Compute the string representation of object 'o'. Returns the + string representation on success, NULL on failure. + + This is the equivalent of the Python expression: repr(o). + + Called by the repr() built-in function. */ + + +/* Implemented elsewhere: + + PyObject *PyObject_Str(PyObject *o); + + Compute the string representation of object, o. Returns the + string representation on success, NULL on failure. + + This is the equivalent of the Python expression: str(o). + + Called by the str() and print() built-in functions. */ + + +/* Declared elsewhere + + PyAPI_FUNC(int) PyCallable_Check(PyObject *o); + + Determine if the object, o, is callable. Return 1 if the object is callable + and 0 otherwise. + + This function always succeeds. */ + + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000 +/* Call a callable Python object without any arguments */ +PyAPI_FUNC(PyObject *) PyObject_CallNoArgs(PyObject *func); +#endif + + +/* Call a callable Python object 'callable' with arguments given by the + tuple 'args' and keywords arguments given by the dictionary 'kwargs'. + + 'args' must not be NULL, use an empty tuple if no arguments are + needed. If no named arguments are needed, 'kwargs' can be NULL. + + This is the equivalent of the Python expression: + callable(*args, **kwargs). */ +PyAPI_FUNC(PyObject *) PyObject_Call(PyObject *callable, + PyObject *args, PyObject *kwargs); + + +/* Call a callable Python object 'callable', with arguments given by the + tuple 'args'. If no arguments are needed, then 'args' can be NULL. + + Returns the result of the call on success, or NULL on failure. + + This is the equivalent of the Python expression: + callable(*args). */ +PyAPI_FUNC(PyObject *) PyObject_CallObject(PyObject *callable, + PyObject *args); + +/* Call a callable Python object, callable, with a variable number of C + arguments. The C arguments are described using a mkvalue-style format + string. + + The format may be NULL, indicating that no arguments are provided. + + Returns the result of the call on success, or NULL on failure. + + This is the equivalent of the Python expression: + callable(arg1, arg2, ...). */ +PyAPI_FUNC(PyObject *) PyObject_CallFunction(PyObject *callable, + const char *format, ...); + +/* Call the method named 'name' of object 'obj' with a variable number of + C arguments. The C arguments are described by a mkvalue format string. + + The format can be NULL, indicating that no arguments are provided. + + Returns the result of the call on success, or NULL on failure. + + This is the equivalent of the Python expression: + obj.name(arg1, arg2, ...). */ +PyAPI_FUNC(PyObject *) PyObject_CallMethod(PyObject *obj, + const char *name, + const char *format, ...); + +/* Call a callable Python object 'callable' with a variable number of C + arguments. 
The C arguments are provided as PyObject* values, terminated + by a NULL. + + Returns the result of the call on success, or NULL on failure. + + This is the equivalent of the Python expression: + callable(arg1, arg2, ...). */ +PyAPI_FUNC(PyObject *) PyObject_CallFunctionObjArgs(PyObject *callable, + ...); + +/* Call the method named 'name' of object 'obj' with a variable number of + C arguments. The C arguments are provided as PyObject* values, terminated + by NULL. + + Returns the result of the call on success, or NULL on failure. + + This is the equivalent of the Python expression: obj.name(*args). */ + +PyAPI_FUNC(PyObject *) PyObject_CallMethodObjArgs( + PyObject *obj, + PyObject *name, + ...); + +/* Given a vectorcall nargsf argument, return the actual number of arguments. + * (For use outside the limited API, this is re-defined as a static inline + * function in cpython/abstract.h) + */ +PyAPI_FUNC(Py_ssize_t) PyVectorcall_NARGS(size_t nargsf); + +/* Call "callable" (which must support vectorcall) with positional arguments + "tuple" and keyword arguments "dict". "dict" may also be NULL */ +PyAPI_FUNC(PyObject *) PyVectorcall_Call(PyObject *callable, PyObject *tuple, PyObject *dict); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030C0000 +#define PY_VECTORCALL_ARGUMENTS_OFFSET \ + (_Py_STATIC_CAST(size_t, 1) << (8 * sizeof(size_t) - 1)) + +/* Perform a PEP 590-style vector call on 'callable' */ +PyAPI_FUNC(PyObject *) PyObject_Vectorcall( + PyObject *callable, + PyObject *const *args, + size_t nargsf, + PyObject *kwnames); + +/* Call the method 'name' on args[0] with arguments in args[1..nargsf-1]. */ +PyAPI_FUNC(PyObject *) PyObject_VectorcallMethod( + PyObject *name, PyObject *const *args, + size_t nargsf, PyObject *kwnames); +#endif + +/* Implemented elsewhere: + + Py_hash_t PyObject_Hash(PyObject *o); + + Compute and return the hash, hash_value, of an object, o. On + failure, return -1. + + This is the equivalent of the Python expression: hash(o). */ + + +/* Implemented elsewhere: + + int PyObject_IsTrue(PyObject *o); + + Returns 1 if the object, o, is considered to be true, 0 if o is + considered to be false and -1 on failure. + + This is equivalent to the Python expression: not not o. */ + + +/* Implemented elsewhere: + + int PyObject_Not(PyObject *o); + + Returns 0 if the object, o, is considered to be true, 1 if o is + considered to be false and -1 on failure. + + This is equivalent to the Python expression: not o. */ + + +/* Get the type of an object. + + On success, returns a type object corresponding to the object type of object + 'o'. On failure, returns NULL. + + This is equivalent to the Python expression: type(o) */ +PyAPI_FUNC(PyObject *) PyObject_Type(PyObject *o); + + +/* Return the size of object 'o'. If the object 'o' provides both sequence and + mapping protocols, the sequence size is returned. + + On error, -1 is returned. + + This is the equivalent to the Python expression: len(o) */ +PyAPI_FUNC(Py_ssize_t) PyObject_Size(PyObject *o); + + +/* For DLL compatibility */ +#undef PyObject_Length +PyAPI_FUNC(Py_ssize_t) PyObject_Length(PyObject *o); +#define PyObject_Length PyObject_Size + +/* Return element of 'o' corresponding to the object 'key'. Return NULL + on failure. + + This is the equivalent of the Python expression: o[key] */ +PyAPI_FUNC(PyObject *) PyObject_GetItem(PyObject *o, PyObject *key); + + +/* Map the object 'key' to the value 'v' into 'o'. + + Raise an exception and return -1 on failure; return 0 on success. 
+ + This is the equivalent of the Python statement: o[key]=v. */ +PyAPI_FUNC(int) PyObject_SetItem(PyObject *o, PyObject *key, PyObject *v); + +/* Remove the mapping for the string 'key' from the object 'o'. + Returns -1 on failure. + + This is equivalent to the Python statement: del o[key]. */ +PyAPI_FUNC(int) PyObject_DelItemString(PyObject *o, const char *key); + +/* Delete the mapping for the object 'key' from the object 'o'. + Returns -1 on failure. + + This is the equivalent of the Python statement: del o[key]. */ +PyAPI_FUNC(int) PyObject_DelItem(PyObject *o, PyObject *key); + + +/* Takes an arbitrary object and returns the result of calling + obj.__format__(format_spec). */ +PyAPI_FUNC(PyObject *) PyObject_Format(PyObject *obj, + PyObject *format_spec); + + +/* ==== Iterators ================================================ */ + +/* Takes an object and returns an iterator for it. + This is typically a new iterator but if the argument is an iterator, this + returns itself. */ +PyAPI_FUNC(PyObject *) PyObject_GetIter(PyObject *); + +/* Takes an AsyncIterable object and returns an AsyncIterator for it. + This is typically a new iterator but if the argument is an AsyncIterator, + this returns itself. */ +PyAPI_FUNC(PyObject *) PyObject_GetAIter(PyObject *); + +/* Returns non-zero if the object 'obj' provides iterator protocols, and 0 otherwise. + + This function always succeeds. */ +PyAPI_FUNC(int) PyIter_Check(PyObject *); + +/* Returns non-zero if the object 'obj' provides AsyncIterator protocols, and 0 otherwise. + + This function always succeeds. */ +PyAPI_FUNC(int) PyAIter_Check(PyObject *); + +/* Takes an iterator object and calls its tp_iternext slot, + returning the next value. + + If the iterator is exhausted, this returns NULL without setting an + exception. + + NULL with an exception means an error occurred. */ +PyAPI_FUNC(PyObject *) PyIter_Next(PyObject *); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 + +/* Takes generator, coroutine or iterator object and sends the value into it. + Returns: + - PYGEN_RETURN (0) if generator has returned. + 'result' parameter is filled with return value + - PYGEN_ERROR (-1) if exception was raised. + 'result' parameter is NULL + - PYGEN_NEXT (1) if generator has yielded. + 'result' parameter is filled with yielded value. */ +PyAPI_FUNC(PySendResult) PyIter_Send(PyObject *, PyObject *, PyObject **); +#endif + + +/* === Number Protocol ================================================== */ + +/* Returns 1 if the object 'o' provides numeric protocols, and 0 otherwise. + + This function always succeeds. */ +PyAPI_FUNC(int) PyNumber_Check(PyObject *o); + +/* Returns the result of adding o1 and o2, or NULL on failure. + + This is the equivalent of the Python expression: o1 + o2. */ +PyAPI_FUNC(PyObject *) PyNumber_Add(PyObject *o1, PyObject *o2); + +/* Returns the result of subtracting o2 from o1, or NULL on failure. + + This is the equivalent of the Python expression: o1 - o2. */ +PyAPI_FUNC(PyObject *) PyNumber_Subtract(PyObject *o1, PyObject *o2); + +/* Returns the result of multiplying o1 and o2, or NULL on failure. + + This is the equivalent of the Python expression: o1 * o2. */ +PyAPI_FUNC(PyObject *) PyNumber_Multiply(PyObject *o1, PyObject *o2); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 +/* This is the equivalent of the Python expression: o1 @ o2. 
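+
+   An illustrative sketch (editor's example) chaining the binary number
+   APIs above; every result is a new strong reference:
+
+       PyObject *sum = PyNumber_Add(a, b);           // a + b
+       if (sum == NULL) {
+           return NULL;
+       }
+       PyObject *scaled = PyNumber_Multiply(sum, c); // (a + b) * c
+       Py_DECREF(sum);
+       if (scaled == NULL) {
+           return NULL;
+       }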
*/ +PyAPI_FUNC(PyObject *) PyNumber_MatrixMultiply(PyObject *o1, PyObject *o2); +#endif + +/* Returns the result of dividing o1 by o2 giving an integral result, + or NULL on failure. + + This is the equivalent of the Python expression: o1 // o2. */ +PyAPI_FUNC(PyObject *) PyNumber_FloorDivide(PyObject *o1, PyObject *o2); + +/* Returns the result of dividing o1 by o2 giving a float result, or NULL on + failure. + + This is the equivalent of the Python expression: o1 / o2. */ +PyAPI_FUNC(PyObject *) PyNumber_TrueDivide(PyObject *o1, PyObject *o2); + +/* Returns the remainder of dividing o1 by o2, or NULL on failure. + + This is the equivalent of the Python expression: o1 % o2. */ +PyAPI_FUNC(PyObject *) PyNumber_Remainder(PyObject *o1, PyObject *o2); + +/* See the built-in function divmod. + + Returns NULL on failure. + + This is the equivalent of the Python expression: divmod(o1, o2). */ +PyAPI_FUNC(PyObject *) PyNumber_Divmod(PyObject *o1, PyObject *o2); + +/* See the built-in function pow. Returns NULL on failure. + + This is the equivalent of the Python expression: pow(o1, o2, o3), + where o3 is optional. */ +PyAPI_FUNC(PyObject *) PyNumber_Power(PyObject *o1, PyObject *o2, + PyObject *o3); + +/* Returns the negation of o on success, or NULL on failure. + + This is the equivalent of the Python expression: -o. */ +PyAPI_FUNC(PyObject *) PyNumber_Negative(PyObject *o); + +/* Returns the positive of o on success, or NULL on failure. + + This is the equivalent of the Python expression: +o. */ +PyAPI_FUNC(PyObject *) PyNumber_Positive(PyObject *o); + +/* Returns the absolute value of 'o', or NULL on failure. + + This is the equivalent of the Python expression: abs(o). */ +PyAPI_FUNC(PyObject *) PyNumber_Absolute(PyObject *o); + +/* Returns the bitwise negation of 'o' on success, or NULL on failure. + + This is the equivalent of the Python expression: ~o. */ +PyAPI_FUNC(PyObject *) PyNumber_Invert(PyObject *o); + +/* Returns the result of left shifting o1 by o2 on success, or NULL on failure. + + This is the equivalent of the Python expression: o1 << o2. */ +PyAPI_FUNC(PyObject *) PyNumber_Lshift(PyObject *o1, PyObject *o2); + +/* Returns the result of right shifting o1 by o2 on success, or NULL on + failure. + + This is the equivalent of the Python expression: o1 >> o2. */ +PyAPI_FUNC(PyObject *) PyNumber_Rshift(PyObject *o1, PyObject *o2); + +/* Returns the result of bitwise and of o1 and o2 on success, or NULL on + failure. + + This is the equivalent of the Python expression: o1 & o2. */ +PyAPI_FUNC(PyObject *) PyNumber_And(PyObject *o1, PyObject *o2); + +/* Returns the bitwise exclusive or of o1 by o2 on success, or NULL on failure. + + This is the equivalent of the Python expression: o1 ^ o2. */ +PyAPI_FUNC(PyObject *) PyNumber_Xor(PyObject *o1, PyObject *o2); + +/* Returns the result of bitwise or on o1 and o2 on success, or NULL on + failure. + + This is the equivalent of the Python expression: o1 | o2. */ +PyAPI_FUNC(PyObject *) PyNumber_Or(PyObject *o1, PyObject *o2); + +/* Returns 1 if obj is an index integer (has the nb_index slot of the + tp_as_number structure filled in), and 0 otherwise. */ +PyAPI_FUNC(int) PyIndex_Check(PyObject *); + +/* Returns the object 'o' converted to a Python int, or NULL with an exception + raised on failure. */ +PyAPI_FUNC(PyObject *) PyNumber_Index(PyObject *o); + +/* Returns the object 'o' converted to Py_ssize_t by going through + PyNumber_Index() first. 
+
+   If an overflow error occurs while converting the int to Py_ssize_t, then the
+   second argument 'exc' is the error-type to return. If it is NULL, then the
+   overflow error is cleared and the value is clipped. */
+PyAPI_FUNC(Py_ssize_t) PyNumber_AsSsize_t(PyObject *o, PyObject *exc);
+
+/* Returns the object 'o' converted to an integer object on success, or NULL
+   on failure.
+
+   This is the equivalent of the Python expression: int(o). */
+PyAPI_FUNC(PyObject *) PyNumber_Long(PyObject *o);
+
+/* Returns the object 'o' converted to a float object on success, or NULL
+   on failure.
+
+   This is the equivalent of the Python expression: float(o). */
+PyAPI_FUNC(PyObject *) PyNumber_Float(PyObject *o);
+
+
+/* --- In-place variants of (some of) the above number protocol functions -- */
+
+/* Returns the result of adding o2 to o1, possibly in-place, or NULL
+   on failure.
+
+   This is the equivalent of the Python expression: o1 += o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceAdd(PyObject *o1, PyObject *o2);
+
+/* Returns the result of subtracting o2 from o1, possibly in-place or
+   NULL on failure.
+
+   This is the equivalent of the Python expression: o1 -= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceSubtract(PyObject *o1, PyObject *o2);
+
+/* Returns the result of multiplying o1 by o2, possibly in-place, or NULL on
+   failure.
+
+   This is the equivalent of the Python expression: o1 *= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceMultiply(PyObject *o1, PyObject *o2);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+/* This is the equivalent of the Python expression: o1 @= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceMatrixMultiply(PyObject *o1, PyObject *o2);
+#endif
+
+/* Returns the result of dividing o1 by o2 giving an integral result, possibly
+   in-place, or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 //= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceFloorDivide(PyObject *o1,
+                                                   PyObject *o2);
+
+/* Returns the result of dividing o1 by o2 giving a float result, possibly
+   in-place, or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 /= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceTrueDivide(PyObject *o1,
+                                                  PyObject *o2);
+
+/* Returns the remainder of dividing o1 by o2, possibly in-place, or NULL on
+   failure.
+
+   This is the equivalent of the Python expression: o1 %= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceRemainder(PyObject *o1, PyObject *o2);
+
+/* Returns the result of raising o1 to the power of o2, possibly in-place,
+   or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 **= o2,
+   or o1 = pow(o1, o2, o3) if o3 is present. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlacePower(PyObject *o1, PyObject *o2,
+                                             PyObject *o3);
+
+/* Returns the result of left shifting o1 by o2, possibly in-place, or NULL
+   on failure.
+
+   This is the equivalent of the Python expression: o1 <<= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceLshift(PyObject *o1, PyObject *o2);
+
+/* Returns the result of right shifting o1 by o2, possibly in-place or NULL
+   on failure.
+
+   This is the equivalent of the Python expression: o1 >>= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceRshift(PyObject *o1, PyObject *o2);
+
+/* Returns the result of bitwise and of o1 and o2, possibly in-place, or NULL
+   on failure.
+
+   This is the equivalent of the Python expression: o1 &= o2.
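+
+   Note that "possibly in-place" still returns a new strong reference, so
+   the usual pattern (an illustrative editor's sketch) is:
+
+       PyObject *tmp = PyNumber_InPlaceAdd(total, item);   // total += item
+       if (tmp == NULL) {
+           return NULL;
+       }
+       Py_DECREF(total);
+       total = tmp;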
*/
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceAnd(PyObject *o1, PyObject *o2);
+
+/* Returns the bitwise exclusive or of o1 by o2, possibly in-place, or NULL
+   on failure.
+
+   This is the equivalent of the Python expression: o1 ^= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceXor(PyObject *o1, PyObject *o2);
+
+/* Returns the result of bitwise or of o1 and o2, possibly in-place,
+   or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 |= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceOr(PyObject *o1, PyObject *o2);
+
+/* Returns the integer n converted to a string with a base, with a base
+   marker of 0b, 0o or 0x prefixed if applicable.
+
+   If n is not an int object, it is converted with PyNumber_Index first. */
+PyAPI_FUNC(PyObject *) PyNumber_ToBase(PyObject *n, int base);
+
+
+/* === Sequence protocol ================================================ */
+
+/* Return 1 if the object provides sequence protocol, and zero
+   otherwise.
+
+   This function always succeeds. */
+PyAPI_FUNC(int) PySequence_Check(PyObject *o);
+
+/* Return the size of sequence object o, or -1 on failure. */
+PyAPI_FUNC(Py_ssize_t) PySequence_Size(PyObject *o);
+
+/* For DLL compatibility */
+#undef PySequence_Length
+PyAPI_FUNC(Py_ssize_t) PySequence_Length(PyObject *o);
+#define PySequence_Length PySequence_Size
+
+
+/* Return the concatenation of o1 and o2 on success, and NULL on failure.
+
+   This is the equivalent of the Python expression: o1 + o2. */
+PyAPI_FUNC(PyObject *) PySequence_Concat(PyObject *o1, PyObject *o2);
+
+/* Return the result of repeating sequence object 'o' 'count' times,
+   or NULL on failure.
+
+   This is the equivalent of the Python expression: o * count. */
+PyAPI_FUNC(PyObject *) PySequence_Repeat(PyObject *o, Py_ssize_t count);
+
+/* Return the ith element of o, or NULL on failure.
+
+   This is the equivalent of the Python expression: o[i]. */
+PyAPI_FUNC(PyObject *) PySequence_GetItem(PyObject *o, Py_ssize_t i);
+
+/* Return the slice of sequence object o between i1 and i2, or NULL on failure.
+
+   This is the equivalent of the Python expression: o[i1:i2]. */
+PyAPI_FUNC(PyObject *) PySequence_GetSlice(PyObject *o, Py_ssize_t i1, Py_ssize_t i2);
+
+/* Assign object 'v' to the ith element of the sequence 'o'. Raise an exception
+   and return -1 on failure; return 0 on success.
+
+   This is the equivalent of the Python statement o[i] = v. */
+PyAPI_FUNC(int) PySequence_SetItem(PyObject *o, Py_ssize_t i, PyObject *v);
+
+/* Delete the 'i'-th element of the sequence 'o'. Returns -1 on failure.
+
+   This is the equivalent of the Python statement: del o[i]. */
+PyAPI_FUNC(int) PySequence_DelItem(PyObject *o, Py_ssize_t i);
+
+/* Assign the sequence object 'v' to the slice in sequence object 'o',
+   from 'i1' to 'i2'. Returns -1 on failure.
+
+   This is the equivalent of the Python statement: o[i1:i2] = v. */
+PyAPI_FUNC(int) PySequence_SetSlice(PyObject *o, Py_ssize_t i1, Py_ssize_t i2,
+                                    PyObject *v);
+
+/* Delete the slice in sequence object 'o' from 'i1' to 'i2'.
+   Returns -1 on failure.
+
+   This is the equivalent of the Python statement: del o[i1:i2]. */
+PyAPI_FUNC(int) PySequence_DelSlice(PyObject *o, Py_ssize_t i1, Py_ssize_t i2);
+
+/* Returns the sequence 'o' as a tuple on success, and NULL on failure.
+
+   This is equivalent to the Python expression: tuple(o). */
+PyAPI_FUNC(PyObject *) PySequence_Tuple(PyObject *o);
+
+/* Returns the sequence 'o' as a list on success, and NULL on failure.
+   This is equivalent to the Python expression: list(o) */
+PyAPI_FUNC(PyObject *) PySequence_List(PyObject *o);
+
+/* Return the sequence 'o' as a list, unless it's already a tuple or list.
+
+   Use PySequence_Fast_GET_ITEM to access the members of this list, and
+   PySequence_Fast_GET_SIZE to get its length.
+
+   Returns NULL on failure. If the object does not support iteration, raises a
+   TypeError exception with 'm' as the message text. */
+PyAPI_FUNC(PyObject *) PySequence_Fast(PyObject *o, const char* m);
+
+/* Return the size of the sequence 'o', assuming that 'o' was returned by
+   PySequence_Fast and is not NULL. */
+#define PySequence_Fast_GET_SIZE(o) \
+    (PyList_Check(o) ? PyList_GET_SIZE(o) : PyTuple_GET_SIZE(o))
+
+/* Return the 'i'-th element of the sequence 'o', assuming that o was returned
+   by PySequence_Fast, and that i is within bounds. */
+#define PySequence_Fast_GET_ITEM(o, i)\
+    (PyList_Check(o) ? PyList_GET_ITEM((o), (i)) : PyTuple_GET_ITEM((o), (i)))
+
+/* Return a pointer to the underlying item array for
+   an object returned by PySequence_Fast */
+#define PySequence_Fast_ITEMS(sf) \
+    (PyList_Check(sf) ? ((PyListObject *)(sf))->ob_item \
+                      : ((PyTupleObject *)(sf))->ob_item)
+
+/* Return the number of occurrences of value in 'o', that is, return
+   the number of keys for which o[key] == value.
+
+   On failure, return -1. This is equivalent to the Python expression:
+   o.count(value). */
+PyAPI_FUNC(Py_ssize_t) PySequence_Count(PyObject *o, PyObject *value);
+
+/* Return 1 if 'ob' is in the sequence 'seq'; 0 if 'ob' is not in the sequence
+   'seq'; -1 on error.
+
+   Use __contains__ if possible, else _PySequence_IterSearch(). */
+PyAPI_FUNC(int) PySequence_Contains(PyObject *seq, PyObject *ob);
+
+/* For DLL-level backwards compatibility */
+#undef PySequence_In
+/* Determine if the sequence 'o' contains 'value'. If an item in 'o' is equal
+   to 'value', return 1, otherwise return 0. On error, return -1.
+
+   This is equivalent to the Python expression: value in o. */
+PyAPI_FUNC(int) PySequence_In(PyObject *o, PyObject *value);
+
+/* For source-level backwards compatibility */
+#define PySequence_In PySequence_Contains
+
+
+/* Return the first index for which o[i] == value.
+   On error, return -1.
+
+   This is equivalent to the Python expression: o.index(value). */
+PyAPI_FUNC(Py_ssize_t) PySequence_Index(PyObject *o, PyObject *value);
+
+
+/* --- In-place versions of some of the above Sequence functions --- */
+
+/* Append sequence 'o2' to sequence 'o1', in-place when possible. Return the
+   resulting object, which could be 'o1', or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 += o2. */
+PyAPI_FUNC(PyObject *) PySequence_InPlaceConcat(PyObject *o1, PyObject *o2);
+
+/* Repeat sequence 'o' by 'count', in-place when possible. Return the resulting
+   object, which could be 'o', or NULL on failure.
+
+   This is the equivalent of the Python expression: o *= count. */
+PyAPI_FUNC(PyObject *) PySequence_InPlaceRepeat(PyObject *o, Py_ssize_t count);
+
+
+/* === Mapping protocol ================================================= */
+
+/* Return 1 if the object provides mapping protocol, and 0 otherwise.
+
+   This function always succeeds. */
+PyAPI_FUNC(int) PyMapping_Check(PyObject *o);
+
+/* Returns the number of keys in mapping object 'o' on success, and -1 on
+   failure. This is equivalent to the Python expression: len(o).
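+
+   For example (an illustrative editor's sketch; "mapping" is hypothetical):
+
+       Py_ssize_t n = PyMapping_Size(mapping);
+       if (n < 0) {
+           return NULL;    // not a mapping, or len() failed
+       }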
*/ +PyAPI_FUNC(Py_ssize_t) PyMapping_Size(PyObject *o); + +/* For DLL compatibility */ +#undef PyMapping_Length +PyAPI_FUNC(Py_ssize_t) PyMapping_Length(PyObject *o); +#define PyMapping_Length PyMapping_Size + + +/* Implemented as a macro: + + int PyMapping_DelItemString(PyObject *o, const char *key); + + Remove the mapping for the string 'key' from the mapping 'o'. Returns -1 on + failure. + + This is equivalent to the Python statement: del o[key]. */ +#define PyMapping_DelItemString(O, K) PyObject_DelItemString((O), (K)) + +/* Implemented as a macro: + + int PyMapping_DelItem(PyObject *o, PyObject *key); + + Remove the mapping for the object 'key' from the mapping object 'o'. + Returns -1 on failure. + + This is equivalent to the Python statement: del o[key]. */ +#define PyMapping_DelItem(O, K) PyObject_DelItem((O), (K)) + +/* On success, return 1 if the mapping object 'o' has the key 'key', + and 0 otherwise. + + This is equivalent to the Python expression: key in o. + + This function always succeeds. */ +PyAPI_FUNC(int) PyMapping_HasKeyString(PyObject *o, const char *key); + +/* Return 1 if the mapping object has the key 'key', and 0 otherwise. + + This is equivalent to the Python expression: key in o. + + This function always succeeds. */ +PyAPI_FUNC(int) PyMapping_HasKey(PyObject *o, PyObject *key); + +/* Return 1 if the mapping object has the key 'key', and 0 otherwise. + This is equivalent to the Python expression: key in o. + On failure, return -1. */ + +PyAPI_FUNC(int) PyMapping_HasKeyWithError(PyObject *o, PyObject *key); + +/* Return 1 if the mapping object has the key 'key', and 0 otherwise. + This is equivalent to the Python expression: key in o. + On failure, return -1. */ + +PyAPI_FUNC(int) PyMapping_HasKeyStringWithError(PyObject *o, const char *key); + +/* On success, return a list or tuple of the keys in mapping object 'o'. + On failure, return NULL. */ +PyAPI_FUNC(PyObject *) PyMapping_Keys(PyObject *o); + +/* On success, return a list or tuple of the values in mapping object 'o'. + On failure, return NULL. */ +PyAPI_FUNC(PyObject *) PyMapping_Values(PyObject *o); + +/* On success, return a list or tuple of the items in mapping object 'o', + where each item is a tuple containing a key-value pair. On failure, return + NULL. */ +PyAPI_FUNC(PyObject *) PyMapping_Items(PyObject *o); + +/* Return element of 'o' corresponding to the string 'key' or NULL on failure. + + This is the equivalent of the Python expression: o[key]. */ +PyAPI_FUNC(PyObject *) PyMapping_GetItemString(PyObject *o, + const char *key); + +/* Variants of PyObject_GetItem() and PyMapping_GetItemString() which don't + raise KeyError if the key is not found. + + If the key is found, return 1 and set *result to a new strong + reference to the corresponding value. + If the key is not found, return 0 and set *result to NULL; + the KeyError is silenced. + If an error other than KeyError is raised, return -1 and + set *result to NULL. +*/ +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030d0000 +PyAPI_FUNC(int) PyMapping_GetOptionalItem(PyObject *, PyObject *, PyObject **); +PyAPI_FUNC(int) PyMapping_GetOptionalItemString(PyObject *, const char *, PyObject **); +#endif + +/* Map the string 'key' to the value 'v' in the mapping 'o'. + Returns -1 on failure. + + This is the equivalent of the Python statement: o[key]=v. 
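+
+   A usage sketch (editor's illustration; "config" is a hypothetical
+   mapping):
+
+       PyObject *value = PyLong_FromLong(42);
+       if (value == NULL) {
+           return NULL;
+       }
+       int rc = PyMapping_SetItemString(config, "timeout", value);
+       Py_DECREF(value);   // the mapping holds its own reference
+       if (rc < 0) {
+           return NULL;
+       }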
*/ +PyAPI_FUNC(int) PyMapping_SetItemString(PyObject *o, const char *key, + PyObject *value); + +/* isinstance(object, typeorclass) */ +PyAPI_FUNC(int) PyObject_IsInstance(PyObject *object, PyObject *typeorclass); + +/* issubclass(object, typeorclass) */ +PyAPI_FUNC(int) PyObject_IsSubclass(PyObject *object, PyObject *typeorclass); + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_ABSTRACTOBJECT_H +# include "cpython/abstract.h" +# undef Py_CPYTHON_ABSTRACTOBJECT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* Py_ABSTRACTOBJECT_H */ diff --git a/Include/bltinmodule.h b/Include/bltinmodule.h new file mode 100644 index 0000000000000000000000000000000000000000..868c9e6443bfc1d1d48fb0806af1bf21490fc44c --- /dev/null +++ b/Include/bltinmodule.h @@ -0,0 +1,14 @@ +#ifndef Py_BLTINMODULE_H +#define Py_BLTINMODULE_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_DATA(PyTypeObject) PyFilter_Type; +PyAPI_DATA(PyTypeObject) PyMap_Type; +PyAPI_DATA(PyTypeObject) PyZip_Type; + +#ifdef __cplusplus +} +#endif +#endif /* !Py_BLTINMODULE_H */ diff --git a/Include/boolobject.h b/Include/boolobject.h new file mode 100644 index 0000000000000000000000000000000000000000..b56e2baecaa36c54c11ea90ac50ffe9921b8b5e0 --- /dev/null +++ b/Include/boolobject.h @@ -0,0 +1,54 @@ +/* Boolean object interface */ + +#ifndef Py_BOOLOBJECT_H +#define Py_BOOLOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + + +// PyBool_Type is declared by object.h + +#define PyBool_Check(x) Py_IS_TYPE((x), &PyBool_Type) + +/* Py_False and Py_True are the only two bools in existence. */ + +/* Don't use these directly */ +PyAPI_DATA(PyLongObject) _Py_FalseStruct; +PyAPI_DATA(PyLongObject) _Py_TrueStruct; + +/* Use these macros */ +#if defined(Py_LIMITED_API) && Py_LIMITED_API+0 >= 0x030D0000 +# define Py_False Py_GetConstantBorrowed(Py_CONSTANT_FALSE) +# define Py_True Py_GetConstantBorrowed(Py_CONSTANT_TRUE) +#else +# define Py_False _PyObject_CAST(&_Py_FalseStruct) +# define Py_True _PyObject_CAST(&_Py_TrueStruct) +#endif + +// Test if an object is the True singleton, the same as "x is True" in Python. +PyAPI_FUNC(int) Py_IsTrue(PyObject *x); +#define Py_IsTrue(x) Py_Is((x), Py_True) + +// Test if an object is the False singleton, the same as "x is False" in Python. +PyAPI_FUNC(int) Py_IsFalse(PyObject *x); +#define Py_IsFalse(x) Py_Is((x), Py_False) + +/* Macros for returning Py_True or Py_False, respectively. + * Only treat Py_True and Py_False as immortal in the limited C API 3.12 + * and newer. */ +#if defined(Py_LIMITED_API) && Py_LIMITED_API+0 < 0x030c0000 +# define Py_RETURN_TRUE return Py_NewRef(Py_True) +# define Py_RETURN_FALSE return Py_NewRef(Py_False) +#else +# define Py_RETURN_TRUE return Py_True +# define Py_RETURN_FALSE return Py_False +#endif + +/* Function to return a bool from a C long */ +PyAPI_FUNC(PyObject *) PyBool_FromLong(long); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_BOOLOBJECT_H */ diff --git a/Include/bytearrayobject.h b/Include/bytearrayobject.h new file mode 100644 index 0000000000000000000000000000000000000000..3d53fdba6432672077aa97c2db0a58fc9cbc8414 --- /dev/null +++ b/Include/bytearrayobject.h @@ -0,0 +1,44 @@ +/* ByteArray object interface */ + +#ifndef Py_BYTEARRAYOBJECT_H +#define Py_BYTEARRAYOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +/* Type PyByteArrayObject represents a mutable array of bytes. + * The Python API is that of a sequence; + * the bytes are mapped to ints in [0, 256). + * Bytes are not characters; they may be used to encode characters. 
+ * The only way to go between bytes and str/unicode is via encoding
+ * and decoding.
+ * For the convenience of C programmers, the bytes type is considered
+ * to contain a char pointer, not an unsigned char pointer.
+ */
+
+/* Type object */
+PyAPI_DATA(PyTypeObject) PyByteArray_Type;
+PyAPI_DATA(PyTypeObject) PyByteArrayIter_Type;
+
+/* Type check macros */
+#define PyByteArray_Check(self) PyObject_TypeCheck((self), &PyByteArray_Type)
+#define PyByteArray_CheckExact(self) Py_IS_TYPE((self), &PyByteArray_Type)
+
+/* Direct API functions */
+PyAPI_FUNC(PyObject *) PyByteArray_FromObject(PyObject *);
+PyAPI_FUNC(PyObject *) PyByteArray_Concat(PyObject *, PyObject *);
+PyAPI_FUNC(PyObject *) PyByteArray_FromStringAndSize(const char *, Py_ssize_t);
+PyAPI_FUNC(Py_ssize_t) PyByteArray_Size(PyObject *);
+PyAPI_FUNC(char *) PyByteArray_AsString(PyObject *);
+PyAPI_FUNC(int) PyByteArray_Resize(PyObject *, Py_ssize_t);
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_BYTEARRAYOBJECT_H
+#  include "cpython/bytearrayobject.h"
+#  undef Py_CPYTHON_BYTEARRAYOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_BYTEARRAYOBJECT_H */
diff --git a/Include/bytesobject.h b/Include/bytesobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5a24195be6bc3771c3b5b65e85106d516b029e1
--- /dev/null
+++ b/Include/bytesobject.h
@@ -0,0 +1,66 @@
+// Bytes object interface
+
+#ifndef Py_BYTESOBJECT_H
+#define Py_BYTESOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+Type PyBytesObject represents a byte string. An extra zero byte is
+reserved at the end to ensure it is zero-terminated, but a size is
+present so strings with null bytes in them can be represented. This
+is an immutable object type.
+
+There are functions to create new bytes objects, to test
+an object for bytes-ness, and to get the
+byte string value. The latter function returns a null pointer
+if the object is not of the proper type.
+There is a variant that takes an explicit size as well as a
+variant that assumes a zero-terminated string. Note that none of the
+functions should be applied to a NULL pointer.
+*/
+
+PyAPI_DATA(PyTypeObject) PyBytes_Type;
+PyAPI_DATA(PyTypeObject) PyBytesIter_Type;
+
+#define PyBytes_Check(op) \
+    PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_BYTES_SUBCLASS)
+#define PyBytes_CheckExact(op) Py_IS_TYPE((op), &PyBytes_Type)
+
+PyAPI_FUNC(PyObject *) PyBytes_FromStringAndSize(const char *, Py_ssize_t);
+PyAPI_FUNC(PyObject *) PyBytes_FromString(const char *);
+PyAPI_FUNC(PyObject *) PyBytes_FromObject(PyObject *);
+PyAPI_FUNC(PyObject *) PyBytes_FromFormatV(const char*, va_list)
+                Py_GCC_ATTRIBUTE((format(printf, 1, 0)));
+PyAPI_FUNC(PyObject *) PyBytes_FromFormat(const char*, ...)
+                Py_GCC_ATTRIBUTE((format(printf, 1, 2)));
+PyAPI_FUNC(Py_ssize_t) PyBytes_Size(PyObject *);
+PyAPI_FUNC(char *) PyBytes_AsString(PyObject *);
+PyAPI_FUNC(PyObject *) PyBytes_Repr(PyObject *, int);
+PyAPI_FUNC(void) PyBytes_Concat(PyObject **, PyObject *);
+PyAPI_FUNC(void) PyBytes_ConcatAndDel(PyObject **, PyObject *);
+PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
+                                            const char *, Py_ssize_t,
+                                            const char *);
+
+/* Provides access to the internal data buffer and size of a bytes object.
+   Passing NULL as len parameter will force the string buffer to be
+   0-terminated (passing a string with embedded NUL characters will
+   cause an exception).
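+
+   An illustrative call (editor's sketch): borrow the buffer without
+   copying; the pointer is only valid while the bytes object stays alive:
+
+       char *buf;
+       Py_ssize_t size;
+       if (PyBytes_AsStringAndSize(bytes, &buf, &size) < 0) {
+           return NULL;
+       }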
*/
+PyAPI_FUNC(int) PyBytes_AsStringAndSize(
+    PyObject *obj,      /* bytes object */
+    char **s,           /* pointer to buffer variable */
+    Py_ssize_t *len     /* pointer to length variable or NULL */
+    );
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_BYTESOBJECT_H
+#  include "cpython/bytesobject.h"
+#  undef Py_CPYTHON_BYTESOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_BYTESOBJECT_H */
diff --git a/Include/ceval.h b/Include/ceval.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ec746c3708220edddb42e00a55d3a04dbd0b465
--- /dev/null
+++ b/Include/ceval.h
@@ -0,0 +1,145 @@
+/* Interface to random parts in ceval.c */
+
+#ifndef Py_CEVAL_H
+#define Py_CEVAL_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+PyAPI_FUNC(PyObject *) PyEval_EvalCode(PyObject *, PyObject *, PyObject *);
+
+PyAPI_FUNC(PyObject *) PyEval_EvalCodeEx(PyObject *co,
+                                         PyObject *globals,
+                                         PyObject *locals,
+                                         PyObject *const *args, int argc,
+                                         PyObject *const *kwds, int kwdc,
+                                         PyObject *const *defs, int defc,
+                                         PyObject *kwdefs, PyObject *closure);
+
+PyAPI_FUNC(PyObject *) PyEval_GetBuiltins(void);
+PyAPI_FUNC(PyObject *) PyEval_GetGlobals(void);
+PyAPI_FUNC(PyObject *) PyEval_GetLocals(void);
+PyAPI_FUNC(PyFrameObject *) PyEval_GetFrame(void);
+
+PyAPI_FUNC(PyObject *) PyEval_GetFrameBuiltins(void);
+PyAPI_FUNC(PyObject *) PyEval_GetFrameGlobals(void);
+PyAPI_FUNC(PyObject *) PyEval_GetFrameLocals(void);
+
+PyAPI_FUNC(int) Py_AddPendingCall(int (*func)(void *), void *arg);
+PyAPI_FUNC(int) Py_MakePendingCalls(void);
+
+/* Protection against deeply nested recursive calls
+
+   In Python 3.0, this protection has two levels:
+   * normal anti-recursion protection is triggered when the recursion level
+     exceeds the current recursion limit. It raises a RecursionError, and sets
+     the "overflowed" flag in the thread state structure. This flag
+     temporarily *disables* the normal protection; this allows cleanup code
+     to potentially outgrow the recursion limit while processing the
+     RecursionError.
+   * "last chance" anti-recursion protection is triggered when the recursion
+     level exceeds "current recursion limit + 50". By construction, this
+     protection can only be triggered when the "overflowed" flag is set. It
+     means the cleanup code has itself gone into an infinite loop, or the
+     RecursionError has been mistakenly ignored. When this protection is
+     triggered, the interpreter aborts with a Fatal Error.
+
+   In addition, the "overflowed" flag is automatically reset when the
+   recursion level drops below "current recursion limit - 50". This heuristic
+   is meant to ensure that the normal anti-recursion protection doesn't get
+   disabled too long.
+
+   Please note: this scheme has its own limitations. See:
+   http://mail.python.org/pipermail/python-dev/2008-August/082106.html
+   for some observations.
+*/
+PyAPI_FUNC(void) Py_SetRecursionLimit(int);
+PyAPI_FUNC(int) Py_GetRecursionLimit(void);
+
+PyAPI_FUNC(int) Py_EnterRecursiveCall(const char *where);
+PyAPI_FUNC(void) Py_LeaveRecursiveCall(void);
+
+PyAPI_FUNC(const char *) PyEval_GetFuncName(PyObject *);
+PyAPI_FUNC(const char *) PyEval_GetFuncDesc(PyObject *);
+
+PyAPI_FUNC(PyObject *) PyEval_EvalFrame(PyFrameObject *);
+PyAPI_FUNC(PyObject *) PyEval_EvalFrameEx(PyFrameObject *f, int exc);
+
+/* Interface for threads.
+
+   A module that plans to do a blocking system call (or something else
+   that lasts a long time and doesn't touch Python data) can allow other
+   threads to run as follows:
+
+    ...preparations here...
+    Py_BEGIN_ALLOW_THREADS
+    ...blocking system call here...
+    Py_END_ALLOW_THREADS
+    ...interpret result here...
+
+   The Py_BEGIN_ALLOW_THREADS/Py_END_ALLOW_THREADS pair expands to a
+   {}-surrounded block.
+   To leave the block in the middle (e.g., with return), you must insert
+   a line containing Py_BLOCK_THREADS before the return, e.g.
+
+    if (...premature_exit...) {
+        Py_BLOCK_THREADS
+        PyErr_SetFromErrno(PyExc_OSError);
+        return NULL;
+    }
+
+   An alternative is:
+
+    Py_BLOCK_THREADS
+    if (...premature_exit...) {
+        PyErr_SetFromErrno(PyExc_OSError);
+        return NULL;
+    }
+    Py_UNBLOCK_THREADS
+
+   For convenience, the value of 'errno' is restored across
+   Py_END_ALLOW_THREADS and Py_BLOCK_THREADS.
+
+   WARNING: NEVER NEST CALLS TO Py_BEGIN_ALLOW_THREADS AND
+   Py_END_ALLOW_THREADS!!!
+
+   Note that not all candidates have yet been converted to use this
+   mechanism!
+*/
+
+PyAPI_FUNC(PyThreadState *) PyEval_SaveThread(void);
+PyAPI_FUNC(void) PyEval_RestoreThread(PyThreadState *);
+
+Py_DEPRECATED(3.9) PyAPI_FUNC(void) PyEval_InitThreads(void);
+
+PyAPI_FUNC(void) PyEval_AcquireThread(PyThreadState *tstate);
+PyAPI_FUNC(void) PyEval_ReleaseThread(PyThreadState *tstate);
+
+#define Py_BEGIN_ALLOW_THREADS { \
+                        PyThreadState *_save; \
+                        _save = PyEval_SaveThread();
+#define Py_BLOCK_THREADS        PyEval_RestoreThread(_save);
+#define Py_UNBLOCK_THREADS      _save = PyEval_SaveThread();
+#define Py_END_ALLOW_THREADS    PyEval_RestoreThread(_save); \
+                 }
+
+/* Masks and values used by FORMAT_VALUE opcode. */
+#define FVC_MASK      0x3
+#define FVC_NONE      0x0
+#define FVC_STR       0x1
+#define FVC_REPR      0x2
+#define FVC_ASCII     0x3
+#define FVS_MASK      0x4
+#define FVS_HAVE_SPEC 0x4
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_CEVAL_H
+#  include "cpython/ceval.h"
+#  undef Py_CPYTHON_CEVAL_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_CEVAL_H */
diff --git a/Include/codecs.h b/Include/codecs.h
new file mode 100644
index 0000000000000000000000000000000000000000..512a3c723eca18344cca0b44fcf278fc79b364ac
--- /dev/null
+++ b/Include/codecs.h
@@ -0,0 +1,176 @@
+#ifndef Py_CODECREGISTRY_H
+#define Py_CODECREGISTRY_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ------------------------------------------------------------------------
+
+   Python Codec Registry and support functions
+
+
+Written by Marc-Andre Lemburg (mal@lemburg.com).
+
+Copyright (c) Corporation for National Research Initiatives.
+
+   ------------------------------------------------------------------------ */
+
+/* Register a new codec search function.
+
+   As side effect, this tries to load the encodings package, if not
+   yet done, to make sure that it is always first in the list of
+   search functions.
+
+   The search_function's refcount is incremented by this function. */
+
+PyAPI_FUNC(int) PyCodec_Register(
+       PyObject *search_function
+       );
+
+/* Unregister a codec search function and clear the registry's cache.
+   If the search function is not registered, do nothing.
+   Return 0 on success. Raise an exception and return -1 on error. */
+
+PyAPI_FUNC(int) PyCodec_Unregister(
+       PyObject *search_function
+       );
+
+/* Codec registry encoding check API.
+
+   Returns 1/0 depending on whether there is a registered codec for
+   the given encoding.
+
+*/
+
+PyAPI_FUNC(int) PyCodec_KnownEncoding(
+       const char *encoding
+       );
+
+/* Generic codec based encoding API.
+
+   object is passed through the encoder function found for the given
+   encoding using the error handling method defined by errors. errors
+   may be NULL to use the default method defined for the codec.
+
+   Raises a LookupError in case no encoder can be found.
+
+ */
+
+PyAPI_FUNC(PyObject *) PyCodec_Encode(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+
+/* Generic codec based decoding API.
+
+   object is passed through the decoder function found for the given
+   encoding using the error handling method defined by errors. errors
+   may be NULL to use the default method defined for the codec.
+
+   Raises a LookupError in case no decoder can be found.
+
+ */
+
+PyAPI_FUNC(PyObject *) PyCodec_Decode(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+
+// --- Codec Lookup APIs --------------------------------------------------
+
+/* Codec registry lookup API.
+
+   Looks up the given encoding and returns a CodecInfo object with
+   function attributes which implement the different aspects of
+   processing the encoding.
+
+   The encoding string is converted to all lower-case characters before
+   being looked up. This makes encodings looked up through this mechanism
+   effectively case-insensitive.
+
+   If no codec is found, a KeyError is set and NULL returned.
+
+   As side effect, this tries to load the encodings package, if not
+   yet done. This is part of the lazy load strategy for the encodings
+   package.
+ */
+
+/* Get an encoder function for the given encoding. */
+
+PyAPI_FUNC(PyObject *) PyCodec_Encoder(const char *encoding);
+
+/* Get a decoder function for the given encoding. */
+
+PyAPI_FUNC(PyObject *) PyCodec_Decoder(const char *encoding);
+
+/* Get an IncrementalEncoder object for the given encoding. */
+
+PyAPI_FUNC(PyObject *) PyCodec_IncrementalEncoder(
+       const char *encoding,
+       const char *errors);
+
+/* Get an IncrementalDecoder object for the given encoding. */
+
+PyAPI_FUNC(PyObject *) PyCodec_IncrementalDecoder(
+       const char *encoding,
+       const char *errors);
+
+/* Get a StreamReader factory function for the given encoding. */
+
+PyAPI_FUNC(PyObject *) PyCodec_StreamReader(
+       const char *encoding,
+       PyObject *stream,
+       const char *errors);
+
+/* Get a StreamWriter factory function for the given encoding. */
+
+PyAPI_FUNC(PyObject *) PyCodec_StreamWriter(
+       const char *encoding,
+       PyObject *stream,
+       const char *errors);
+
+/* Unicode encoding error handling callback registry API */
+
+/* Register the error handling callback function error under the given
+   name. This callback will be called by a codec when it encounters
+   unencodable characters/undecodable bytes, if name is specified as the
+   errors parameter in the call to the encode/decode function.
+   Return 0 on success, -1 on error */
+PyAPI_FUNC(int) PyCodec_RegisterError(const char *name, PyObject *error);
+
+/* Lookup the error handling callback function registered under the given
+   name. As a special case NULL can be passed, in which case
+   the error handling callback for "strict" will be returned. */
+PyAPI_FUNC(PyObject *) PyCodec_LookupError(const char *name);
+
+/* raise exc as an exception */
+PyAPI_FUNC(PyObject *) PyCodec_StrictErrors(PyObject *exc);
+
+/* ignore the unicode error, skipping the faulty input */
+PyAPI_FUNC(PyObject *) PyCodec_IgnoreErrors(PyObject *exc);
+
+/* replace the unicode encode error with ? or U+FFFD */
+PyAPI_FUNC(PyObject *) PyCodec_ReplaceErrors(PyObject *exc);
+
+/* replace the unicode encode error with XML character references */
+PyAPI_FUNC(PyObject *) PyCodec_XMLCharRefReplaceErrors(PyObject *exc);
+
+/* replace the unicode encode error with backslash escapes (\x, \u and \U) */
+PyAPI_FUNC(PyObject *) PyCodec_BackslashReplaceErrors(PyObject *exc);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+/* replace the unicode encode error with backslash escapes (\N, \x, \u and \U) */
+PyAPI_FUNC(PyObject *) PyCodec_NameReplaceErrors(PyObject *exc);
+#endif
+
+#ifndef Py_LIMITED_API
+PyAPI_DATA(const char *) Py_hexdigits;
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_CODECREGISTRY_H */
diff --git a/Include/compile.h b/Include/compile.h
new file mode 100644
index 0000000000000000000000000000000000000000..52d0bc76c9fca4485451ee4b18034059cf91cdb5
--- /dev/null
+++ b/Include/compile.h
@@ -0,0 +1,22 @@
+#ifndef Py_COMPILE_H
+#define Py_COMPILE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* These definitions must match corresponding definitions in graminit.h. */
+#define Py_single_input 256
+#define Py_file_input 257
+#define Py_eval_input 258
+#define Py_func_type_input 345
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_COMPILE_H
+#  include "cpython/compile.h"
+#  undef Py_CPYTHON_COMPILE_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_COMPILE_H */
diff --git a/Include/complexobject.h b/Include/complexobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebe49a832f74141c0cc059e72a15c01edf937dba
--- /dev/null
+++ b/Include/complexobject.h
@@ -0,0 +1,30 @@
+/* Complex number structure */
+
+#ifndef Py_COMPLEXOBJECT_H
+#define Py_COMPLEXOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Complex object interface */
+
+PyAPI_DATA(PyTypeObject) PyComplex_Type;
+
+#define PyComplex_Check(op) PyObject_TypeCheck((op), &PyComplex_Type)
+#define PyComplex_CheckExact(op) Py_IS_TYPE((op), &PyComplex_Type)
+
+PyAPI_FUNC(PyObject *) PyComplex_FromDoubles(double real, double imag);
+
+PyAPI_FUNC(double) PyComplex_RealAsDouble(PyObject *op);
+PyAPI_FUNC(double) PyComplex_ImagAsDouble(PyObject *op);
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_COMPLEXOBJECT_H
+#  include "cpython/complexobject.h"
+#  undef Py_CPYTHON_COMPLEXOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_COMPLEXOBJECT_H */
diff --git a/Include/cpython/abstract.h b/Include/cpython/abstract.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e7b7a46703a6d08a3d73b16d1bfb7e164d53afb
--- /dev/null
+++ b/Include/cpython/abstract.h
@@ -0,0 +1,87 @@
+#ifndef Py_CPYTHON_ABSTRACTOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+/* === Object Protocol ================================================== */
+
+/* Like PyObject_CallMethod(), but expect a _Py_Identifier*
+   as the method name. */
+PyAPI_FUNC(PyObject*) _PyObject_CallMethodId(
+    PyObject *obj,
+    _Py_Identifier *name,
+    const char *format, ...);
+
+/* Convert keyword arguments from the FASTCALL (stack: C array, kwnames: tuple)
+   format to a Python dictionary ("kwargs" dict).
+
+   The type of kwnames keys is not checked. The final function getting
+   arguments is responsible for checking if all keys are strings, for example
+   using PyArg_ParseTupleAndKeywords() or PyArg_ValidateKeywordArguments().
+
+   Duplicate keys are merged using the last value.
If duplicate keys must raise
+   an exception, the caller is responsible for performing an explicit
+   duplicate-key check on kwnames. */
+PyAPI_FUNC(PyObject*) _PyStack_AsDict(PyObject *const *values, PyObject *kwnames);
+
+
+/* === Vectorcall protocol (PEP 590) ============================= */
+
+// PyVectorcall_NARGS() is exported as a function for the stable ABI.
+// Here (when we are not using the stable ABI), the name is overridden to
+// call a static inline function for best performance.
+static inline Py_ssize_t
+_PyVectorcall_NARGS(size_t n)
+{
+    return n & ~PY_VECTORCALL_ARGUMENTS_OFFSET;
+}
+#define PyVectorcall_NARGS(n) _PyVectorcall_NARGS(n)
+
+PyAPI_FUNC(vectorcallfunc) PyVectorcall_Function(PyObject *callable);
+
+// Backwards compatibility aliases (PEP 590) for API that was provisional
+// in Python 3.8
+#define _PyObject_Vectorcall PyObject_Vectorcall
+#define _PyObject_VectorcallMethod PyObject_VectorcallMethod
+#define _PyObject_FastCallDict PyObject_VectorcallDict
+#define _PyVectorcall_Function PyVectorcall_Function
+#define _PyObject_CallOneArg PyObject_CallOneArg
+#define _PyObject_CallMethodNoArgs PyObject_CallMethodNoArgs
+#define _PyObject_CallMethodOneArg PyObject_CallMethodOneArg
+
+/* Same as PyObject_Vectorcall except that keyword arguments are passed as
+   dict, which may be NULL if there are no keyword arguments. */
+PyAPI_FUNC(PyObject *) PyObject_VectorcallDict(
+    PyObject *callable,
+    PyObject *const *args,
+    size_t nargsf,
+    PyObject *kwargs);
+
+PyAPI_FUNC(PyObject *) PyObject_CallOneArg(PyObject *func, PyObject *arg);
+
+static inline PyObject *
+PyObject_CallMethodNoArgs(PyObject *self, PyObject *name)
+{
+    size_t nargsf = 1 | PY_VECTORCALL_ARGUMENTS_OFFSET;
+    return PyObject_VectorcallMethod(name, &self, nargsf, _Py_NULL);
+}
+
+static inline PyObject *
+PyObject_CallMethodOneArg(PyObject *self, PyObject *name, PyObject *arg)
+{
+    PyObject *args[2] = {self, arg};
+    size_t nargsf = 2 | PY_VECTORCALL_ARGUMENTS_OFFSET;
+    assert(arg != NULL);
+    return PyObject_VectorcallMethod(name, args, nargsf, _Py_NULL);
+}
+
+/* Guess the size of object 'o' using len(o) or o.__length_hint__().
+   If neither of those return a non-negative value, then return the default
+   value. If one of the calls fails, this function returns -1. */
+PyAPI_FUNC(Py_ssize_t) PyObject_LengthHint(PyObject *o, Py_ssize_t);
+
+/* === Sequence protocol ================================================ */
+
+/* Assume tp_as_sequence and sq_item exist and that 'i' does not
+   need to be corrected for a negative index.
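+
+   As an illustrative sketch (not part of this header), a caller that has
+   already verified PySequence_Check(seq) and computed a valid non-negative
+   index might write:
+
+       PyObject *item = PySequence_ITEM(seq, i);  // new reference
+       if (item == NULL) { return NULL; }         // sq_item can still fail
+       Py_DECREF(item);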
*/ +#define PySequence_ITEM(o, i)\ + ( Py_TYPE(o)->tp_as_sequence->sq_item((o), (i)) ) diff --git a/Include/cpython/bytearrayobject.h b/Include/cpython/bytearrayobject.h new file mode 100644 index 0000000000000000000000000000000000000000..9ba176eb2d3ac2aa650f764d991c37c2d42bd925 --- /dev/null +++ b/Include/cpython/bytearrayobject.h @@ -0,0 +1,34 @@ +#ifndef Py_CPYTHON_BYTEARRAYOBJECT_H +# error "this header file must not be included directly" +#endif + +/* Object layout */ +typedef struct { + PyObject_VAR_HEAD + Py_ssize_t ob_alloc; /* How many bytes allocated in ob_bytes */ + char *ob_bytes; /* Physical backing buffer */ + char *ob_start; /* Logical start inside ob_bytes */ + Py_ssize_t ob_exports; /* How many buffer exports */ +} PyByteArrayObject; + +PyAPI_DATA(char) _PyByteArray_empty_string[]; + +/* Macros and static inline functions, trading safety for speed */ +#define _PyByteArray_CAST(op) \ + (assert(PyByteArray_Check(op)), _Py_CAST(PyByteArrayObject*, op)) + +static inline char* PyByteArray_AS_STRING(PyObject *op) +{ + PyByteArrayObject *self = _PyByteArray_CAST(op); + if (Py_SIZE(self)) { + return self->ob_start; + } + return _PyByteArray_empty_string; +} +#define PyByteArray_AS_STRING(self) PyByteArray_AS_STRING(_PyObject_CAST(self)) + +static inline Py_ssize_t PyByteArray_GET_SIZE(PyObject *op) { + PyByteArrayObject *self = _PyByteArray_CAST(op); + return Py_SIZE(self); +} +#define PyByteArray_GET_SIZE(self) PyByteArray_GET_SIZE(_PyObject_CAST(self)) diff --git a/Include/cpython/bytesobject.h b/Include/cpython/bytesobject.h new file mode 100644 index 0000000000000000000000000000000000000000..41537210b748a1c51b15f9226d14e33cdcfd43b2 --- /dev/null +++ b/Include/cpython/bytesobject.h @@ -0,0 +1,37 @@ +#ifndef Py_CPYTHON_BYTESOBJECT_H +# error "this header file must not be included directly" +#endif + +typedef struct { + PyObject_VAR_HEAD + Py_DEPRECATED(3.11) Py_hash_t ob_shash; + char ob_sval[1]; + + /* Invariants: + * ob_sval contains space for 'ob_size+1' elements. + * ob_sval[ob_size] == 0. + * ob_shash is the hash of the byte string or -1 if not computed yet. + */ +} PyBytesObject; + +PyAPI_FUNC(int) _PyBytes_Resize(PyObject **, Py_ssize_t); + +/* Macros and static inline functions, trading safety for speed */ +#define _PyBytes_CAST(op) \ + (assert(PyBytes_Check(op)), _Py_CAST(PyBytesObject*, op)) + +static inline char* PyBytes_AS_STRING(PyObject *op) +{ + return _PyBytes_CAST(op)->ob_sval; +} +#define PyBytes_AS_STRING(op) PyBytes_AS_STRING(_PyObject_CAST(op)) + +static inline Py_ssize_t PyBytes_GET_SIZE(PyObject *op) { + PyBytesObject *self = _PyBytes_CAST(op); + return Py_SIZE(self); +} +#define PyBytes_GET_SIZE(self) PyBytes_GET_SIZE(_PyObject_CAST(self)) + +/* _PyBytes_Join(sep, x) is like sep.join(x). sep must be PyBytesObject*, + x must be an iterable object. 
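+
+   A hedged usage sketch (the variable names here are illustrative only):
+
+       // joined = sep.join(parts); sep is bytes, parts yields bytes objects
+       PyObject *joined = _PyBytes_Join(sep, parts);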
*/
+PyAPI_FUNC(PyObject*) _PyBytes_Join(PyObject *sep, PyObject *x);
diff --git a/Include/cpython/cellobject.h b/Include/cpython/cellobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..47a6a491497ea030863fb71b7b21db5560f77ed1
--- /dev/null
+++ b/Include/cpython/cellobject.h
@@ -0,0 +1,44 @@
+/* Cell object interface */
+
+#ifndef Py_LIMITED_API
+#ifndef Py_CELLOBJECT_H
+#define Py_CELLOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    PyObject_HEAD
+    /* Content of the cell or NULL when empty */
+    PyObject *ob_ref;
+} PyCellObject;
+
+PyAPI_DATA(PyTypeObject) PyCell_Type;
+
+#define PyCell_Check(op) Py_IS_TYPE((op), &PyCell_Type)
+
+PyAPI_FUNC(PyObject *) PyCell_New(PyObject *);
+PyAPI_FUNC(PyObject *) PyCell_Get(PyObject *);
+PyAPI_FUNC(int) PyCell_Set(PyObject *, PyObject *);
+
+static inline PyObject* PyCell_GET(PyObject *op) {
+    PyCellObject *cell;
+    assert(PyCell_Check(op));
+    cell = _Py_CAST(PyCellObject*, op);
+    return cell->ob_ref;
+}
+#define PyCell_GET(op) PyCell_GET(_PyObject_CAST(op))
+
+static inline void PyCell_SET(PyObject *op, PyObject *value) {
+    PyCellObject *cell;
+    assert(PyCell_Check(op));
+    cell = _Py_CAST(PyCellObject*, op);
+    cell->ob_ref = value;
+}
+#define PyCell_SET(op, value) PyCell_SET(_PyObject_CAST(op), (value))
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_CELLOBJECT_H */
+#endif /* Py_LIMITED_API */
diff --git a/Include/cpython/ceval.h b/Include/cpython/ceval.h
new file mode 100644
index 0000000000000000000000000000000000000000..78f7405661662f8c6e6f0c76c3667d902ffbc837
--- /dev/null
+++ b/Include/cpython/ceval.h
@@ -0,0 +1,25 @@
+#ifndef Py_CPYTHON_CEVAL_H
+# error "this header file must not be included directly"
+#endif
+
+PyAPI_FUNC(void) PyEval_SetProfile(Py_tracefunc, PyObject *);
+PyAPI_FUNC(void) PyEval_SetProfileAllThreads(Py_tracefunc, PyObject *);
+PyAPI_FUNC(void) PyEval_SetTrace(Py_tracefunc, PyObject *);
+PyAPI_FUNC(void) PyEval_SetTraceAllThreads(Py_tracefunc, PyObject *);
+
+/* Look at the current frame's (if any) code's co_flags, and turn on
+   the corresponding compiler flags in cf->cf_flags. Return 1 if any
+   flag was set, else return 0.
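+
+   An illustrative sketch (not part of this header): an embedder that wants
+   __future__ imports active in the calling frame to carry over into newly
+   compiled code might write:
+
+       PyCompilerFlags cf = _PyCompilerFlags_INIT;
+       PyEval_MergeCompilerFlags(&cf);
+       PyObject *code = Py_CompileStringFlags(src, "<embedded>",
+                                              Py_file_input, &cf);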
*/ +PyAPI_FUNC(int) PyEval_MergeCompilerFlags(PyCompilerFlags *cf); + +PyAPI_FUNC(PyObject *) _PyEval_EvalFrameDefault(PyThreadState *tstate, struct _PyInterpreterFrame *f, int exc); + +PyAPI_FUNC(Py_ssize_t) PyUnstable_Eval_RequestCodeExtraIndex(freefunc); +// Old name -- remove when this API changes: +_Py_DEPRECATED_EXTERNALLY(3.12) static inline Py_ssize_t +_PyEval_RequestCodeExtraIndex(freefunc f) { + return PyUnstable_Eval_RequestCodeExtraIndex(f); +} + +PyAPI_FUNC(int) _PyEval_SliceIndex(PyObject *, Py_ssize_t *); +PyAPI_FUNC(int) _PyEval_SliceIndexNotNone(PyObject *, Py_ssize_t *); diff --git a/Include/cpython/classobject.h b/Include/cpython/classobject.h new file mode 100644 index 0000000000000000000000000000000000000000..d7c9ddd1336c46d8a57898cdcfcb0731c0cfb6a9 --- /dev/null +++ b/Include/cpython/classobject.h @@ -0,0 +1,71 @@ +/* Former class object interface -- now only bound methods are here */ + +/* Revealing some structures (not for general use) */ + +#ifndef Py_LIMITED_API +#ifndef Py_CLASSOBJECT_H +#define Py_CLASSOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + PyObject_HEAD + PyObject *im_func; /* The callable object implementing the method */ + PyObject *im_self; /* The instance it is bound to */ + PyObject *im_weakreflist; /* List of weak references */ + vectorcallfunc vectorcall; +} PyMethodObject; + +PyAPI_DATA(PyTypeObject) PyMethod_Type; + +#define PyMethod_Check(op) Py_IS_TYPE((op), &PyMethod_Type) + +PyAPI_FUNC(PyObject *) PyMethod_New(PyObject *, PyObject *); + +PyAPI_FUNC(PyObject *) PyMethod_Function(PyObject *); +PyAPI_FUNC(PyObject *) PyMethod_Self(PyObject *); + +#define _PyMethod_CAST(meth) \ + (assert(PyMethod_Check(meth)), _Py_CAST(PyMethodObject*, meth)) + +/* Static inline functions for direct access to these values. + Type checks are *not* done, so use with care. */ +static inline PyObject* PyMethod_GET_FUNCTION(PyObject *meth) { + return _PyMethod_CAST(meth)->im_func; +} +#define PyMethod_GET_FUNCTION(meth) PyMethod_GET_FUNCTION(_PyObject_CAST(meth)) + +static inline PyObject* PyMethod_GET_SELF(PyObject *meth) { + return _PyMethod_CAST(meth)->im_self; +} +#define PyMethod_GET_SELF(meth) PyMethod_GET_SELF(_PyObject_CAST(meth)) + +typedef struct { + PyObject_HEAD + PyObject *func; +} PyInstanceMethodObject; + +PyAPI_DATA(PyTypeObject) PyInstanceMethod_Type; + +#define PyInstanceMethod_Check(op) Py_IS_TYPE((op), &PyInstanceMethod_Type) + +PyAPI_FUNC(PyObject *) PyInstanceMethod_New(PyObject *); +PyAPI_FUNC(PyObject *) PyInstanceMethod_Function(PyObject *); + +#define _PyInstanceMethod_CAST(meth) \ + (assert(PyInstanceMethod_Check(meth)), \ + _Py_CAST(PyInstanceMethodObject*, meth)) + +/* Static inline function for direct access to these values. + Type checks are *not* done, so use with care. 
*/ +static inline PyObject* PyInstanceMethod_GET_FUNCTION(PyObject *meth) { + return _PyInstanceMethod_CAST(meth)->func; +} +#define PyInstanceMethod_GET_FUNCTION(meth) PyInstanceMethod_GET_FUNCTION(_PyObject_CAST(meth)) + +#ifdef __cplusplus +} +#endif +#endif // !Py_CLASSOBJECT_H +#endif // !Py_LIMITED_API diff --git a/Include/cpython/code.h b/Include/cpython/code.h new file mode 100644 index 0000000000000000000000000000000000000000..bd8afab8f93ca071b264880ef6b64f1f8424bc32 --- /dev/null +++ b/Include/cpython/code.h @@ -0,0 +1,358 @@ +/* Definitions for bytecode */ + +#ifndef Py_LIMITED_API +#ifndef Py_CODE_H +#define Py_CODE_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* Count of all local monitoring events */ +#define _PY_MONITORING_LOCAL_EVENTS 10 +/* Count of all "real" monitoring events (not derived from other events) */ +#define _PY_MONITORING_UNGROUPED_EVENTS 15 +/* Count of all monitoring events */ +#define _PY_MONITORING_EVENTS 17 + +/* Tables of which tools are active for each monitored event. */ +typedef struct _Py_LocalMonitors { + uint8_t tools[_PY_MONITORING_LOCAL_EVENTS]; +} _Py_LocalMonitors; + +typedef struct _Py_GlobalMonitors { + uint8_t tools[_PY_MONITORING_UNGROUPED_EVENTS]; +} _Py_GlobalMonitors; + + +typedef struct { + PyObject *_co_code; + PyObject *_co_varnames; + PyObject *_co_cellvars; + PyObject *_co_freevars; +} _PyCoCached; + +/* Ancillary data structure used for instrumentation. + Line instrumentation creates this with sufficient + space for one entry per code unit. The total size + of the data will be `bytes_per_entry * Py_SIZE(code)` */ +typedef struct { + uint8_t bytes_per_entry; + uint8_t data[1]; +} _PyCoLineInstrumentationData; + + +typedef struct { + int size; + int capacity; + struct _PyExecutorObject *executors[1]; +} _PyExecutorArray; + +/* Main data structure used for instrumentation. + * This is allocated when needed for instrumentation + */ +typedef struct { + /* Monitoring specific to this code object */ + _Py_LocalMonitors local_monitors; + /* Monitoring that is active on this code object */ + _Py_LocalMonitors active_monitors; + /* The tools that are to be notified for events for the matching code unit */ + uint8_t *tools; + /* Information to support line events */ + _PyCoLineInstrumentationData *lines; + /* The tools that are to be notified for line events for the matching code unit */ + uint8_t *line_tools; + /* Information to support instruction events */ + /* The underlying instructions, which can themselves be instrumented */ + uint8_t *per_instruction_opcodes; + /* The tools that are to be notified for instruction events for the matching code unit */ + uint8_t *per_instruction_tools; +} _PyCoMonitoringData; + +// To avoid repeating ourselves in deepfreeze.py, all PyCodeObject members are +// defined in this macro: +#define _PyCode_DEF(SIZE) { \ + PyObject_VAR_HEAD \ + \ + /* Note only the following fields are used in hash and/or comparisons \ + * \ + * - co_name \ + * - co_argcount \ + * - co_posonlyargcount \ + * - co_kwonlyargcount \ + * - co_nlocals \ + * - co_stacksize \ + * - co_flags \ + * - co_firstlineno \ + * - co_consts \ + * - co_names \ + * - co_localsplusnames \ + * This is done to preserve the name and line number for tracebacks \ + * and debuggers; otherwise, constant de-duplication would collapse \ + * identical functions/lambdas defined on different lines. \ + */ \ + \ + /* These fields are set with provided values on new code objects. 
*/ \ + \ + /* The hottest fields (in the eval loop) are grouped here at the top. */ \ + PyObject *co_consts; /* list (constants used) */ \ + PyObject *co_names; /* list of strings (names used) */ \ + PyObject *co_exceptiontable; /* Byte string encoding exception handling \ + table */ \ + int co_flags; /* CO_..., see below */ \ + \ + /* The rest are not so impactful on performance. */ \ + int co_argcount; /* #arguments, except *args */ \ + int co_posonlyargcount; /* #positional only arguments */ \ + int co_kwonlyargcount; /* #keyword only arguments */ \ + int co_stacksize; /* #entries needed for evaluation stack */ \ + int co_firstlineno; /* first source line number */ \ + \ + /* redundant values (derived from co_localsplusnames and \ + co_localspluskinds) */ \ + int co_nlocalsplus; /* number of local + cell + free variables */ \ + int co_framesize; /* Size of frame in words */ \ + int co_nlocals; /* number of local variables */ \ + int co_ncellvars; /* total number of cell variables */ \ + int co_nfreevars; /* number of free variables */ \ + uint32_t co_version; /* version number */ \ + \ + PyObject *co_localsplusnames; /* tuple mapping offsets to names */ \ + PyObject *co_localspluskinds; /* Bytes mapping to local kinds (one byte \ + per variable) */ \ + PyObject *co_filename; /* unicode (where it was loaded from) */ \ + PyObject *co_name; /* unicode (name, for reference) */ \ + PyObject *co_qualname; /* unicode (qualname, for reference) */ \ + PyObject *co_linetable; /* bytes object that holds location info */ \ + PyObject *co_weakreflist; /* to support weakrefs to code objects */ \ + _PyExecutorArray *co_executors; /* executors from optimizer */ \ + _PyCoCached *_co_cached; /* cached co_* attributes */ \ + uintptr_t _co_instrumentation_version; /* current instrumentation version */ \ + _PyCoMonitoringData *_co_monitoring; /* Monitoring data */ \ + int _co_firsttraceable; /* index of first traceable instruction */ \ + /* Scratch space for extra data relating to the code object. \ + Type is a void* to keep the format private in codeobject.c to force \ + people to go through the proper APIs. */ \ + void *co_extra; \ + char co_code_adaptive[(SIZE)]; \ +} + +/* Bytecode object */ +struct PyCodeObject _PyCode_DEF(1); + +/* Masks for co_flags above */ +#define CO_OPTIMIZED 0x0001 +#define CO_NEWLOCALS 0x0002 +#define CO_VARARGS 0x0004 +#define CO_VARKEYWORDS 0x0008 +#define CO_NESTED 0x0010 +#define CO_GENERATOR 0x0020 + +/* The CO_COROUTINE flag is set for coroutine functions (defined with + ``async def`` keywords) */ +#define CO_COROUTINE 0x0080 +#define CO_ITERABLE_COROUTINE 0x0100 +#define CO_ASYNC_GENERATOR 0x0200 + +/* bpo-39562: These constant values are changed in Python 3.9 + to prevent collision with compiler flags. CO_FUTURE_ and PyCF_ + constants must be kept unique. PyCF_ constants can use bits from + 0x0100 to 0x10000. CO_FUTURE_ constants use bits starting at 0x20000. */ +#define CO_FUTURE_DIVISION 0x20000 +#define CO_FUTURE_ABSOLUTE_IMPORT 0x40000 /* do absolute imports by default */ +#define CO_FUTURE_WITH_STATEMENT 0x80000 +#define CO_FUTURE_PRINT_FUNCTION 0x100000 +#define CO_FUTURE_UNICODE_LITERALS 0x200000 + +#define CO_FUTURE_BARRY_AS_BDFL 0x400000 +#define CO_FUTURE_GENERATOR_STOP 0x800000 +#define CO_FUTURE_ANNOTATIONS 0x1000000 + +#define CO_NO_MONITORING_EVENTS 0x2000000 + +/* This should be defined if a future statement modifies the syntax. + For example, when a keyword is added. 
+*/ +#define PY_PARSER_REQUIRES_FUTURE_KEYWORD + +#define CO_MAXBLOCKS 21 /* Max static block nesting within a function */ + +PyAPI_DATA(PyTypeObject) PyCode_Type; + +#define PyCode_Check(op) Py_IS_TYPE((op), &PyCode_Type) + +static inline Py_ssize_t PyCode_GetNumFree(PyCodeObject *op) { + assert(PyCode_Check(op)); + return op->co_nfreevars; +} + +static inline int PyUnstable_Code_GetFirstFree(PyCodeObject *op) { + assert(PyCode_Check(op)); + return op->co_nlocalsplus - op->co_nfreevars; +} + +Py_DEPRECATED(3.13) static inline int PyCode_GetFirstFree(PyCodeObject *op) { + return PyUnstable_Code_GetFirstFree(op); +} + +/* Unstable public interface */ +PyAPI_FUNC(PyCodeObject *) PyUnstable_Code_New( + int, int, int, int, int, PyObject *, PyObject *, + PyObject *, PyObject *, PyObject *, PyObject *, + PyObject *, PyObject *, PyObject *, int, PyObject *, + PyObject *); + +PyAPI_FUNC(PyCodeObject *) PyUnstable_Code_NewWithPosOnlyArgs( + int, int, int, int, int, int, PyObject *, PyObject *, + PyObject *, PyObject *, PyObject *, PyObject *, + PyObject *, PyObject *, PyObject *, int, PyObject *, + PyObject *); + /* same as struct above */ +// Old names -- remove when this API changes: +_Py_DEPRECATED_EXTERNALLY(3.12) static inline PyCodeObject * +PyCode_New( + int a, int b, int c, int d, int e, PyObject *f, PyObject *g, + PyObject *h, PyObject *i, PyObject *j, PyObject *k, + PyObject *l, PyObject *m, PyObject *n, int o, PyObject *p, + PyObject *q) +{ + return PyUnstable_Code_New( + a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q); +} +_Py_DEPRECATED_EXTERNALLY(3.12) static inline PyCodeObject * +PyCode_NewWithPosOnlyArgs( + int a, int poac, int b, int c, int d, int e, PyObject *f, PyObject *g, + PyObject *h, PyObject *i, PyObject *j, PyObject *k, + PyObject *l, PyObject *m, PyObject *n, int o, PyObject *p, + PyObject *q) +{ + return PyUnstable_Code_NewWithPosOnlyArgs( + a, poac, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q); +} + +/* Creates a new empty code object with the specified source location. */ +PyAPI_FUNC(PyCodeObject *) +PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno); + +/* Return the line number associated with the specified bytecode index + in this code object. If you just need the line number of a frame, + use PyFrame_GetLineNumber() instead. */ +PyAPI_FUNC(int) PyCode_Addr2Line(PyCodeObject *, int); + +PyAPI_FUNC(int) PyCode_Addr2Location(PyCodeObject *, int, int *, int *, int *, int *); + +#define PY_FOREACH_CODE_EVENT(V) \ + V(CREATE) \ + V(DESTROY) + +typedef enum { + #define PY_DEF_EVENT(op) PY_CODE_EVENT_##op, + PY_FOREACH_CODE_EVENT(PY_DEF_EVENT) + #undef PY_DEF_EVENT +} PyCodeEvent; + + +/* + * A callback that is invoked for different events in a code object's lifecycle. + * + * The callback is invoked with a borrowed reference to co, after it is + * created and before it is destroyed. + * + * If the callback sets an exception, it must return -1. Otherwise + * it should return 0. + */ +typedef int (*PyCode_WatchCallback)( + PyCodeEvent event, + PyCodeObject* co); + +/* + * Register a per-interpreter callback that will be invoked for code object + * lifecycle events. + * + * Returns a handle that may be passed to PyCode_ClearWatcher on success, + * or -1 and sets an error if no more handles are available. + */ +PyAPI_FUNC(int) PyCode_AddWatcher(PyCode_WatchCallback callback); + +/* + * Clear the watcher associated with the watcher_id handle. + * + * Returns 0 on success or -1 if no watcher exists for the provided id. 
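+ *
+ * A minimal registration sketch (the callback name is hypothetical):
+ *
+ *     static int my_code_cb(PyCodeEvent event, PyCodeObject *co) {
+ *         return 0;  // observe only; return -1 with an exception on error
+ *     }
+ *     int wid = PyCode_AddWatcher(my_code_cb);
+ *     // ... later ...
+ *     PyCode_ClearWatcher(wid);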
+ */
+PyAPI_FUNC(int) PyCode_ClearWatcher(int watcher_id);
+
+/* for internal use only */
+struct _opaque {
+    int computed_line;
+    const uint8_t *lo_next;
+    const uint8_t *limit;
+};
+
+typedef struct _line_offsets {
+    int ar_start;
+    int ar_end;
+    int ar_line;
+    struct _opaque opaque;
+} PyCodeAddressRange;
+
+/* Update *bounds to describe the first and one-past-the-last instructions in the
+   same line as lasti. Return the number of that line.
+*/
+PyAPI_FUNC(int) _PyCode_CheckLineNumber(int lasti, PyCodeAddressRange *bounds);
+
+/* Create a comparable key used to compare constants taking into account the
+ * object type. It is used to make sure types are not coerced (e.g., float and
+ * complex) _and_ to distinguish 0.0 from -0.0 e.g. on IEEE platforms
+ *
+ * Return (type(obj), obj, ...): a tuple with variable size (at least 2 items)
+ * depending on the type and the value. The type is placed first so that
+ * bytes and str are never compared directly, which could raise a
+ * BytesWarning exception. */
+PyAPI_FUNC(PyObject*) _PyCode_ConstantKey(PyObject *obj);
+
+PyAPI_FUNC(PyObject*) PyCode_Optimize(PyObject *code, PyObject* consts,
+                                      PyObject *names, PyObject *lnotab);
+
+PyAPI_FUNC(int) PyUnstable_Code_GetExtra(
+    PyObject *code, Py_ssize_t index, void **extra);
+PyAPI_FUNC(int) PyUnstable_Code_SetExtra(
+    PyObject *code, Py_ssize_t index, void *extra);
+// Old names -- remove when this API changes:
+_Py_DEPRECATED_EXTERNALLY(3.12) static inline int
+_PyCode_GetExtra(PyObject *code, Py_ssize_t index, void **extra)
+{
+    return PyUnstable_Code_GetExtra(code, index, extra);
+}
+_Py_DEPRECATED_EXTERNALLY(3.12) static inline int
+_PyCode_SetExtra(PyObject *code, Py_ssize_t index, void *extra)
+{
+    return PyUnstable_Code_SetExtra(code, index, extra);
+}
+
+/* Equivalent to getattr(code, 'co_code') in Python.
+   Returns a strong reference to a bytes object. */
+PyAPI_FUNC(PyObject *) PyCode_GetCode(PyCodeObject *code);
+/* Equivalent to getattr(code, 'co_varnames') in Python. */
+PyAPI_FUNC(PyObject *) PyCode_GetVarnames(PyCodeObject *code);
+/* Equivalent to getattr(code, 'co_cellvars') in Python. */
+PyAPI_FUNC(PyObject *) PyCode_GetCellvars(PyCodeObject *code);
+/* Equivalent to getattr(code, 'co_freevars') in Python. */
+PyAPI_FUNC(PyObject *) PyCode_GetFreevars(PyCodeObject *code);
+
+typedef enum _PyCodeLocationInfoKind {
+    /* short forms are 0 to 9 */
+    PY_CODE_LOCATION_INFO_SHORT0 = 0,
+    /* one-line forms are 10 to 12 */
+    PY_CODE_LOCATION_INFO_ONE_LINE0 = 10,
+    PY_CODE_LOCATION_INFO_ONE_LINE1 = 11,
+    PY_CODE_LOCATION_INFO_ONE_LINE2 = 12,
+
+    PY_CODE_LOCATION_INFO_NO_COLUMNS = 13,
+    PY_CODE_LOCATION_INFO_LONG = 14,
+    PY_CODE_LOCATION_INFO_NONE = 15
+} _PyCodeLocationInfoKind;
+
+#ifdef __cplusplus
+}
+#endif
+#endif // !Py_CODE_H
+#endif // !Py_LIMITED_API
diff --git a/Include/cpython/compile.h b/Include/cpython/compile.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfdb7080d45f2b166082df0be2f1508a3eef9946
--- /dev/null
+++ b/Include/cpython/compile.h
@@ -0,0 +1,50 @@
+#ifndef Py_CPYTHON_COMPILE_H
+# error "this header file must not be included directly"
+#endif
+
+/* Public interface */
+#define PyCF_MASK (CO_FUTURE_DIVISION | CO_FUTURE_ABSOLUTE_IMPORT | \
+                   CO_FUTURE_WITH_STATEMENT | CO_FUTURE_PRINT_FUNCTION | \
+                   CO_FUTURE_UNICODE_LITERALS | CO_FUTURE_BARRY_AS_BDFL | \
+                   CO_FUTURE_GENERATOR_STOP | CO_FUTURE_ANNOTATIONS)
+#define PyCF_MASK_OBSOLETE (CO_NESTED)
+
+/* bpo-39562: CO_FUTURE_ and PyCF_ constants must be kept unique.
+   PyCF_ constants can use bits from 0x0100 to 0x10000.
+ CO_FUTURE_ constants use bits starting at 0x20000. */ +#define PyCF_SOURCE_IS_UTF8 0x0100 +#define PyCF_DONT_IMPLY_DEDENT 0x0200 +#define PyCF_ONLY_AST 0x0400 +#define PyCF_IGNORE_COOKIE 0x0800 +#define PyCF_TYPE_COMMENTS 0x1000 +#define PyCF_ALLOW_TOP_LEVEL_AWAIT 0x2000 +#define PyCF_ALLOW_INCOMPLETE_INPUT 0x4000 +#define PyCF_OPTIMIZED_AST (0x8000 | PyCF_ONLY_AST) +#define PyCF_COMPILE_MASK (PyCF_ONLY_AST | PyCF_ALLOW_TOP_LEVEL_AWAIT | \ + PyCF_TYPE_COMMENTS | PyCF_DONT_IMPLY_DEDENT | \ + PyCF_ALLOW_INCOMPLETE_INPUT | PyCF_OPTIMIZED_AST) + +typedef struct { + int cf_flags; /* bitmask of CO_xxx flags relevant to future */ + int cf_feature_version; /* minor Python version (PyCF_ONLY_AST) */ +} PyCompilerFlags; + +#define _PyCompilerFlags_INIT \ + (PyCompilerFlags){.cf_flags = 0, .cf_feature_version = PY_MINOR_VERSION} + +/* Future feature support */ + +#define FUTURE_NESTED_SCOPES "nested_scopes" +#define FUTURE_GENERATORS "generators" +#define FUTURE_DIVISION "division" +#define FUTURE_ABSOLUTE_IMPORT "absolute_import" +#define FUTURE_WITH_STATEMENT "with_statement" +#define FUTURE_PRINT_FUNCTION "print_function" +#define FUTURE_UNICODE_LITERALS "unicode_literals" +#define FUTURE_BARRY_AS_BDFL "barry_as_FLUFL" +#define FUTURE_GENERATOR_STOP "generator_stop" +#define FUTURE_ANNOTATIONS "annotations" + +#define PY_INVALID_STACK_EFFECT INT_MAX +PyAPI_FUNC(int) PyCompile_OpcodeStackEffect(int opcode, int oparg); +PyAPI_FUNC(int) PyCompile_OpcodeStackEffectWithJump(int opcode, int oparg, int jump); diff --git a/Include/cpython/complexobject.h b/Include/cpython/complexobject.h new file mode 100644 index 0000000000000000000000000000000000000000..fbdc6a91fe895c0f7af0bff6a3836a1991709c20 --- /dev/null +++ b/Include/cpython/complexobject.h @@ -0,0 +1,33 @@ +#ifndef Py_CPYTHON_COMPLEXOBJECT_H +# error "this header file must not be included directly" +#endif + +typedef struct { + double real; + double imag; +} Py_complex; + +// Operations on complex numbers. +PyAPI_FUNC(Py_complex) _Py_c_sum(Py_complex, Py_complex); +PyAPI_FUNC(Py_complex) _Py_c_diff(Py_complex, Py_complex); +PyAPI_FUNC(Py_complex) _Py_c_neg(Py_complex); +PyAPI_FUNC(Py_complex) _Py_c_prod(Py_complex, Py_complex); +PyAPI_FUNC(Py_complex) _Py_c_quot(Py_complex, Py_complex); +PyAPI_FUNC(Py_complex) _Py_c_pow(Py_complex, Py_complex); +PyAPI_FUNC(double) _Py_c_abs(Py_complex); + + +/* Complex object interface */ + +/* +PyComplexObject represents a complex number with double-precision +real and imaginary parts. 
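+
+An illustrative round trip (not part of this header):
+
+    Py_complex c = {1.0, -2.0};
+    PyObject *z = PyComplex_FromCComplex(c);    // complex(1.0, -2.0)
+    double re = PyComplex_RealAsDouble(z);      // 1.0
+    Py_complex back = PyComplex_AsCComplex(z);  // {1.0, -2.0}
+    Py_DECREF(z);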
+*/ +typedef struct { + PyObject_HEAD + Py_complex cval; +} PyComplexObject; + +PyAPI_FUNC(PyObject *) PyComplex_FromCComplex(Py_complex); + +PyAPI_FUNC(Py_complex) PyComplex_AsCComplex(PyObject *op); diff --git a/Include/cpython/context.h b/Include/cpython/context.h new file mode 100644 index 0000000000000000000000000000000000000000..a3249fc29b082e4230488e9345948eaa7f502c09 --- /dev/null +++ b/Include/cpython/context.h @@ -0,0 +1,74 @@ +#ifndef Py_LIMITED_API +#ifndef Py_CONTEXT_H +#define Py_CONTEXT_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_DATA(PyTypeObject) PyContext_Type; +typedef struct _pycontextobject PyContext; + +PyAPI_DATA(PyTypeObject) PyContextVar_Type; +typedef struct _pycontextvarobject PyContextVar; + +PyAPI_DATA(PyTypeObject) PyContextToken_Type; +typedef struct _pycontexttokenobject PyContextToken; + + +#define PyContext_CheckExact(o) Py_IS_TYPE((o), &PyContext_Type) +#define PyContextVar_CheckExact(o) Py_IS_TYPE((o), &PyContextVar_Type) +#define PyContextToken_CheckExact(o) Py_IS_TYPE((o), &PyContextToken_Type) + + +PyAPI_FUNC(PyObject *) PyContext_New(void); +PyAPI_FUNC(PyObject *) PyContext_Copy(PyObject *); +PyAPI_FUNC(PyObject *) PyContext_CopyCurrent(void); + +PyAPI_FUNC(int) PyContext_Enter(PyObject *); +PyAPI_FUNC(int) PyContext_Exit(PyObject *); + + +/* Create a new context variable. + + default_value can be NULL. +*/ +PyAPI_FUNC(PyObject *) PyContextVar_New( + const char *name, PyObject *default_value); + + +/* Get a value for the variable. + + Returns -1 if an error occurred during lookup. + + Returns 0 if value either was or was not found. + + If value was found, *value will point to it. + If not, it will point to: + + - default_value, if not NULL; + - the default value of "var", if not NULL; + - NULL. + + '*value' will be a new ref, if not NULL. +*/ +PyAPI_FUNC(int) PyContextVar_Get( + PyObject *var, PyObject *default_value, PyObject **value); + + +/* Set a new value for the variable. + Returns NULL if an error occurs. +*/ +PyAPI_FUNC(PyObject *) PyContextVar_Set(PyObject *var, PyObject *value); + + +/* Reset a variable to its previous value. + Returns 0 on success, -1 on error. +*/ +PyAPI_FUNC(int) PyContextVar_Reset(PyObject *var, PyObject *token); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_CONTEXT_H */ +#endif /* !Py_LIMITED_API */ diff --git a/Include/cpython/critical_section.h b/Include/cpython/critical_section.h new file mode 100644 index 0000000000000000000000000000000000000000..35db3fb6a59ce66c23aa5efa948ef76a3f489d4d --- /dev/null +++ b/Include/cpython/critical_section.h @@ -0,0 +1,134 @@ +#ifndef Py_CPYTHON_CRITICAL_SECTION_H +# error "this header file must not be included directly" +#endif + +// Python critical sections +// +// Conceptually, critical sections are a deadlock avoidance layer on top of +// per-object locks. These helpers, in combination with those locks, replace +// our usage of the global interpreter lock to provide thread-safety for +// otherwise thread-unsafe objects, such as dict. +// +// NOTE: These APIs are no-ops in non-free-threaded builds. +// +// Straightforward per-object locking could introduce deadlocks that were not +// present when running with the GIL. Threads may hold locks for multiple +// objects simultaneously because Python operations can nest. If threads were +// to acquire the same locks in different orders, they would deadlock. 
+//
+// One way to avoid deadlocks is to allow threads to hold only the lock (or
+// locks) for a single operation at a time (typically a single lock, but some
+// operations involve two locks). When a thread begins a nested operation it
+// could suspend the locks for any outer operation: before beginning the nested
+// operation, the locks for the outer operation are released and when the
+// nested operation completes, the locks for the outer operation are
+// reacquired.
+//
+// To improve performance, this API uses a variation of the above scheme.
+// Instead of immediately suspending locks any time a nested operation begins,
+// locks are only suspended if the thread would block. This reduces the number
+// of lock acquisitions and releases for nested operations, while still
+// avoiding deadlocks.
+//
+// Additionally, the locks for any active operation are suspended around
+// other potentially blocking operations, such as I/O. This is because the
+// interaction between locks and blocking operations can lead to deadlocks in
+// the same way as the interaction between multiple locks.
+//
+// Each thread's critical sections and their corresponding locks are tracked in
+// a stack in `PyThreadState.critical_section`. When a thread calls
+// `_PyThreadState_Detach()`, such as before a blocking I/O operation or when
+// waiting to acquire a lock, the thread suspends all of its active critical
+// sections, temporarily releasing the associated locks. When the thread calls
+// `_PyThreadState_Attach()`, it resumes the top-most (i.e., most recent)
+// critical section by reacquiring the associated lock or locks. See
+// `_PyCriticalSection_Resume()`.
+//
+// NOTE: Only the top-most critical section is guaranteed to be active.
+// Operations that need to lock two objects at once must use
+// `Py_BEGIN_CRITICAL_SECTION2()`. You *CANNOT* use nested critical sections
+// to lock more than one object at once, because the inner critical section
+// may suspend the outer critical sections. This API does not provide a way
+// to lock more than two objects at once (though it could be added later
+// if actually needed).
+//
+// NOTE: Critical sections implicitly behave like reentrant locks because
+// attempting to acquire the same lock will suspend any outer (earlier)
+// critical sections. However, they are less efficient for this use case than
+// purposefully designed reentrant locks.
+//
+// Example usage:
+//      Py_BEGIN_CRITICAL_SECTION(op);
+//      ...
+//      Py_END_CRITICAL_SECTION();
+//
+// To lock two objects at once:
+//      Py_BEGIN_CRITICAL_SECTION2(op1, op2);
+//      ...
+//      Py_END_CRITICAL_SECTION2();
+
+typedef struct PyCriticalSection PyCriticalSection;
+typedef struct PyCriticalSection2 PyCriticalSection2;
+
+PyAPI_FUNC(void)
+PyCriticalSection_Begin(PyCriticalSection *c, PyObject *op);
+
+PyAPI_FUNC(void)
+PyCriticalSection_End(PyCriticalSection *c);
+
+PyAPI_FUNC(void)
+PyCriticalSection2_Begin(PyCriticalSection2 *c, PyObject *a, PyObject *b);
+
+PyAPI_FUNC(void)
+PyCriticalSection2_End(PyCriticalSection2 *c);
+
+#ifndef Py_GIL_DISABLED
+# define Py_BEGIN_CRITICAL_SECTION(op)          \
+    {
+# define Py_END_CRITICAL_SECTION()              \
+    }
+# define Py_BEGIN_CRITICAL_SECTION2(a, b)       \
+    {
+# define Py_END_CRITICAL_SECTION2()             \
+    }
+#else /* !Py_GIL_DISABLED */
+
+// NOTE: the contents of this struct are private and may change between
+// Python releases without a deprecation period.
+struct PyCriticalSection {
+    // Tagged pointer to an outer active critical section (or 0).
+    uintptr_t _cs_prev;
+
+    // Mutex used to protect critical section
+    PyMutex *_cs_mutex;
+};
+
+// A critical section protected by two mutexes. Use
+// Py_BEGIN_CRITICAL_SECTION2 and Py_END_CRITICAL_SECTION2.
+// NOTE: the contents of this struct are private and may change between
+// Python releases without a deprecation period.
+struct PyCriticalSection2 {
+    PyCriticalSection _cs_base;
+
+    PyMutex *_cs_mutex2;
+};
+
+# define Py_BEGIN_CRITICAL_SECTION(op)                                  \
+    {                                                                   \
+        PyCriticalSection _py_cs;                                      \
+        PyCriticalSection_Begin(&_py_cs, _PyObject_CAST(op))
+
+# define Py_END_CRITICAL_SECTION()                                      \
+        PyCriticalSection_End(&_py_cs);                                \
+    }
+
+# define Py_BEGIN_CRITICAL_SECTION2(a, b)                               \
+    {                                                                   \
+        PyCriticalSection2 _py_cs2;                                    \
+        PyCriticalSection2_Begin(&_py_cs2, _PyObject_CAST(a), _PyObject_CAST(b))
+
+# define Py_END_CRITICAL_SECTION2()                                     \
+        PyCriticalSection2_End(&_py_cs2);                              \
+    }
+
+#endif
diff --git a/Include/cpython/descrobject.h b/Include/cpython/descrobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..bbad8b59c225ab5a8e52493dcebd08ab7499e67e
--- /dev/null
+++ b/Include/cpython/descrobject.h
@@ -0,0 +1,62 @@
+#ifndef Py_CPYTHON_DESCROBJECT_H
+# error "this header file must not be included directly"
+#endif
+
+typedef PyObject *(*wrapperfunc)(PyObject *self, PyObject *args,
+                                 void *wrapped);
+
+typedef PyObject *(*wrapperfunc_kwds)(PyObject *self, PyObject *args,
+                                      void *wrapped, PyObject *kwds);
+
+struct wrapperbase {
+    const char *name;
+    int offset;
+    void *function;
+    wrapperfunc wrapper;
+    const char *doc;
+    int flags;
+    PyObject *name_strobj;
+};
+
+/* Flags for above struct */
+#define PyWrapperFlag_KEYWORDS 1 /* wrapper function takes keyword args */
+
+/* Various kinds of descriptor objects */
+
+typedef struct {
+    PyObject_HEAD
+    PyTypeObject *d_type;
+    PyObject *d_name;
+    PyObject *d_qualname;
+} PyDescrObject;
+
+#define PyDescr_COMMON PyDescrObject d_common
+
+#define PyDescr_TYPE(x) (((PyDescrObject *)(x))->d_type)
+#define PyDescr_NAME(x) (((PyDescrObject *)(x))->d_name)
+
+typedef struct {
+    PyDescr_COMMON;
+    PyMethodDef *d_method;
+    vectorcallfunc vectorcall;
+} PyMethodDescrObject;
+
+typedef struct {
+    PyDescr_COMMON;
+    PyMemberDef *d_member;
+} PyMemberDescrObject;
+
+typedef struct {
+    PyDescr_COMMON;
+    PyGetSetDef *d_getset;
+} PyGetSetDescrObject;
+
+typedef struct {
+    PyDescr_COMMON;
+    struct wrapperbase *d_base;
+    void *d_wrapped; /* This can be any function pointer */
+} PyWrapperDescrObject;
+
+PyAPI_FUNC(PyObject *) PyDescr_NewWrapper(PyTypeObject *,
+                                          struct wrapperbase *, void *);
+PyAPI_FUNC(int) PyDescr_IsData(PyObject *);
diff --git a/Include/cpython/dictobject.h b/Include/cpython/dictobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fd23b9313c45361aea981a7d6cae99ab8aa52c3
--- /dev/null
+++ b/Include/cpython/dictobject.h
@@ -0,0 +1,102 @@
+#ifndef Py_CPYTHON_DICTOBJECT_H
+# error "this header file must not be included directly"
+#endif
+
+typedef struct _dictkeysobject PyDictKeysObject;
+typedef struct _dictvalues PyDictValues;
+
+/* The ma_values pointer is NULL for a combined table
+ * or points to an array of PyObject* for a split table
+ */
+typedef struct {
+    PyObject_HEAD
+
+    /* Number of items in the dictionary */
+    Py_ssize_t ma_used;
+
+    /* Dictionary version: globally unique; the value changes each time
+       the dictionary is modified */
+#ifdef Py_BUILD_CORE
+    /* Bits 0-7 are for dict watchers.
+ * Bits 8-11 are for the watched mutation counter (used by tier2 optimization) + * The remaining bits (12-63) are the actual version tag. */ + uint64_t ma_version_tag; +#else + Py_DEPRECATED(3.12) uint64_t ma_version_tag; +#endif + + PyDictKeysObject *ma_keys; + + /* If ma_values is NULL, the table is "combined": keys and values + are stored in ma_keys. + + If ma_values is not NULL, the table is split: + keys are stored in ma_keys and values are stored in ma_values */ + PyDictValues *ma_values; +} PyDictObject; + +PyAPI_FUNC(PyObject *) _PyDict_GetItem_KnownHash(PyObject *mp, PyObject *key, + Py_hash_t hash); +PyAPI_FUNC(PyObject *) _PyDict_GetItemStringWithError(PyObject *, const char *); +PyAPI_FUNC(PyObject *) PyDict_SetDefault( + PyObject *mp, PyObject *key, PyObject *defaultobj); + +// Inserts `key` with a value `default_value`, if `key` is not already present +// in the dictionary. If `result` is not NULL, then the value associated +// with `key` is returned in `*result` (either the existing value, or the now +// inserted `default_value`). +// Returns: +// -1 on error +// 0 if `key` was not present and `default_value` was inserted +// 1 if `key` was present and `default_value` was not inserted +PyAPI_FUNC(int) PyDict_SetDefaultRef(PyObject *mp, PyObject *key, PyObject *default_value, PyObject **result); + +/* Get the number of items of a dictionary. */ +static inline Py_ssize_t PyDict_GET_SIZE(PyObject *op) { + PyDictObject *mp; + assert(PyDict_Check(op)); + mp = _Py_CAST(PyDictObject*, op); +#ifdef Py_GIL_DISABLED + return _Py_atomic_load_ssize_relaxed(&mp->ma_used); +#else + return mp->ma_used; +#endif +} +#define PyDict_GET_SIZE(op) PyDict_GET_SIZE(_PyObject_CAST(op)) + +PyAPI_FUNC(int) PyDict_ContainsString(PyObject *mp, const char *key); + +PyAPI_FUNC(PyObject *) _PyDict_NewPresized(Py_ssize_t minused); + +PyAPI_FUNC(int) PyDict_Pop(PyObject *dict, PyObject *key, PyObject **result); +PyAPI_FUNC(int) PyDict_PopString(PyObject *dict, const char *key, PyObject **result); +PyAPI_FUNC(PyObject *) _PyDict_Pop(PyObject *dict, PyObject *key, PyObject *default_value); + +/* Dictionary watchers */ + +#define PY_FOREACH_DICT_EVENT(V) \ + V(ADDED) \ + V(MODIFIED) \ + V(DELETED) \ + V(CLONED) \ + V(CLEARED) \ + V(DEALLOCATED) + +typedef enum { + #define PY_DEF_EVENT(EVENT) PyDict_EVENT_##EVENT, + PY_FOREACH_DICT_EVENT(PY_DEF_EVENT) + #undef PY_DEF_EVENT +} PyDict_WatchEvent; + +// Callback to be invoked when a watched dict is cleared, dealloced, or modified. +// In clear/dealloc case, key and new_value will be NULL. Otherwise, new_value will be the +// new value for key, NULL if key is being deleted. 
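+// An illustrative sketch (callback and variable names are hypothetical):
+//
+//     static int my_dict_cb(PyDict_WatchEvent event, PyObject *dict,
+//                           PyObject *key, PyObject *new_value) {
+//         return 0;  // return -1 with an exception set on error
+//     }
+//     int wid = PyDict_AddWatcher(my_dict_cb);
+//     PyDict_Watch(wid, my_dict);  // mutations of my_dict now invoke my_dict_cb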
+typedef int(*PyDict_WatchCallback)(PyDict_WatchEvent event, PyObject* dict, PyObject* key, PyObject* new_value); + +// Register/unregister a dict-watcher callback +PyAPI_FUNC(int) PyDict_AddWatcher(PyDict_WatchCallback callback); +PyAPI_FUNC(int) PyDict_ClearWatcher(int watcher_id); + +// Mark given dictionary as "watched" (callback will be called if it is modified) +PyAPI_FUNC(int) PyDict_Watch(int watcher_id, PyObject* dict); +PyAPI_FUNC(int) PyDict_Unwatch(int watcher_id, PyObject* dict); diff --git a/Include/cpython/fileobject.h b/Include/cpython/fileobject.h new file mode 100644 index 0000000000000000000000000000000000000000..e2d89c522bdd1326b440421c76f4cc0f77d6ba87 --- /dev/null +++ b/Include/cpython/fileobject.h @@ -0,0 +1,16 @@ +#ifndef Py_CPYTHON_FILEOBJECT_H +# error "this header file must not be included directly" +#endif + +PyAPI_FUNC(char *) Py_UniversalNewlineFgets(char *, int, FILE*, PyObject *); + +/* The std printer acts as a preliminary sys.stderr until the new io + infrastructure is in place. */ +PyAPI_FUNC(PyObject *) PyFile_NewStdPrinter(int); +PyAPI_DATA(PyTypeObject) PyStdPrinter_Type; + +typedef PyObject * (*Py_OpenCodeHookFunction)(PyObject *, void *); + +PyAPI_FUNC(PyObject *) PyFile_OpenCode(const char *utf8path); +PyAPI_FUNC(PyObject *) PyFile_OpenCodeObject(PyObject *path); +PyAPI_FUNC(int) PyFile_SetOpenCodeHook(Py_OpenCodeHookFunction hook, void *userData); diff --git a/Include/cpython/fileutils.h b/Include/cpython/fileutils.h new file mode 100644 index 0000000000000000000000000000000000000000..b386ad107bde1ff9f522137167c10868766d2f30 --- /dev/null +++ b/Include/cpython/fileutils.h @@ -0,0 +1,8 @@ +#ifndef Py_CPYTHON_FILEUTILS_H +# error "this header file must not be included directly" +#endif + +// Used by _testcapi which must not use the internal C API +PyAPI_FUNC(FILE*) _Py_fopen_obj( + PyObject *path, + const char *mode); diff --git a/Include/cpython/floatobject.h b/Include/cpython/floatobject.h new file mode 100644 index 0000000000000000000000000000000000000000..127093098bfe642355383c0be1ba6e560b1f93cd --- /dev/null +++ b/Include/cpython/floatobject.h @@ -0,0 +1,27 @@ +#ifndef Py_CPYTHON_FLOATOBJECT_H +# error "this header file must not be included directly" +#endif + +typedef struct { + PyObject_HEAD + double ob_fval; +} PyFloatObject; + +#define _PyFloat_CAST(op) \ + (assert(PyFloat_Check(op)), _Py_CAST(PyFloatObject*, op)) + +// Static inline version of PyFloat_AsDouble() trading safety for speed. +// It doesn't check if op is a double object. 
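+// An illustrative guard (not part of this header):
+//     if (PyFloat_Check(op)) { double v = PyFloat_AS_DOUBLE(op); ... }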
+static inline double PyFloat_AS_DOUBLE(PyObject *op) { + return _PyFloat_CAST(op)->ob_fval; +} +#define PyFloat_AS_DOUBLE(op) PyFloat_AS_DOUBLE(_PyObject_CAST(op)) + + +PyAPI_FUNC(int) PyFloat_Pack2(double x, char *p, int le); +PyAPI_FUNC(int) PyFloat_Pack4(double x, char *p, int le); +PyAPI_FUNC(int) PyFloat_Pack8(double x, char *p, int le); + +PyAPI_FUNC(double) PyFloat_Unpack2(const char *p, int le); +PyAPI_FUNC(double) PyFloat_Unpack4(const char *p, int le); +PyAPI_FUNC(double) PyFloat_Unpack8(const char *p, int le); diff --git a/Include/cpython/frameobject.h b/Include/cpython/frameobject.h new file mode 100644 index 0000000000000000000000000000000000000000..dbbfbb5105ba7aa51840beff4a90b4e103d497da --- /dev/null +++ b/Include/cpython/frameobject.h @@ -0,0 +1,35 @@ +/* Frame object interface */ + +#ifndef Py_CPYTHON_FRAMEOBJECT_H +# error "this header file must not be included directly" +#endif + +/* Standard object interface */ + +PyAPI_FUNC(PyFrameObject *) PyFrame_New(PyThreadState *, PyCodeObject *, + PyObject *, PyObject *); + +/* The rest of the interface is specific for frame objects */ + +/* Conversions between "fast locals" and locals in dictionary */ + +PyAPI_FUNC(void) PyFrame_LocalsToFast(PyFrameObject *, int); + +/* -- Caveat emptor -- + * The concept of entry frames is an implementation detail of the CPython + * interpreter. This API is considered unstable and is provided for the + * convenience of debuggers, profilers and state-inspecting tools. Notice that + * this API can be changed in future minor versions if the underlying frame + * mechanism change or the concept of an 'entry frame' or its semantics becomes + * obsolete or outdated. */ + +PyAPI_FUNC(int) _PyFrame_IsEntryFrame(PyFrameObject *frame); + +PyAPI_FUNC(int) PyFrame_FastToLocalsWithError(PyFrameObject *f); +PyAPI_FUNC(void) PyFrame_FastToLocals(PyFrameObject *); + + +typedef struct { + PyObject_HEAD + PyFrameObject* frame; +} PyFrameLocalsProxyObject; diff --git a/Include/cpython/funcobject.h b/Include/cpython/funcobject.h new file mode 100644 index 0000000000000000000000000000000000000000..5433ba48eefc69220e7e031a164f3095644100ae --- /dev/null +++ b/Include/cpython/funcobject.h @@ -0,0 +1,184 @@ +/* Function object interface */ + +#ifndef Py_LIMITED_API +#ifndef Py_FUNCOBJECT_H +#define Py_FUNCOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + + +#define _Py_COMMON_FIELDS(PREFIX) \ + PyObject *PREFIX ## globals; \ + PyObject *PREFIX ## builtins; \ + PyObject *PREFIX ## name; \ + PyObject *PREFIX ## qualname; \ + PyObject *PREFIX ## code; /* A code object, the __code__ attribute */ \ + PyObject *PREFIX ## defaults; /* NULL or a tuple */ \ + PyObject *PREFIX ## kwdefaults; /* NULL or a dict */ \ + PyObject *PREFIX ## closure; /* NULL or a tuple of cell objects */ + +typedef struct { + _Py_COMMON_FIELDS(fc_) +} PyFrameConstructor; + +/* Function objects and code objects should not be confused with each other: + * + * Function objects are created by the execution of the 'def' statement. + * They reference a code object in their __code__ attribute, which is a + * purely syntactic object, i.e. nothing more than a compiled version of some + * source code lines. There is one code object per source code "fragment", + * but each code object can be referenced by zero or many function objects + * depending only on how many times the 'def' statement in the source was + * executed so far. 
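+ *
+ *  For example, a 'def' statement in a loop body creates a new function
+ *  object on every iteration, while all of those functions share the one
+ *  code object compiled from that 'def' statement's body.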
+ */ + +typedef struct { + PyObject_HEAD + _Py_COMMON_FIELDS(func_) + PyObject *func_doc; /* The __doc__ attribute, can be anything */ + PyObject *func_dict; /* The __dict__ attribute, a dict or NULL */ + PyObject *func_weakreflist; /* List of weak references */ + PyObject *func_module; /* The __module__ attribute, can be anything */ + PyObject *func_annotations; /* Annotations, a dict or NULL */ + PyObject *func_typeparams; /* Tuple of active type variables or NULL */ + vectorcallfunc vectorcall; + /* Version number for use by specializer. + * Can set to non-zero when we want to specialize. + * Will be set to zero if any of these change: + * defaults + * kwdefaults (only if the object changes, not the contents of the dict) + * code + * annotations + * vectorcall function pointer */ + uint32_t func_version; + + /* Invariant: + * func_closure contains the bindings for func_code->co_freevars, so + * PyTuple_Size(func_closure) == PyCode_GetNumFree(func_code) + * (func_closure may be NULL if PyCode_GetNumFree(func_code) == 0). + */ +} PyFunctionObject; + +#undef _Py_COMMON_FIELDS + +PyAPI_DATA(PyTypeObject) PyFunction_Type; + +#define PyFunction_Check(op) Py_IS_TYPE((op), &PyFunction_Type) + +PyAPI_FUNC(PyObject *) PyFunction_New(PyObject *, PyObject *); +PyAPI_FUNC(PyObject *) PyFunction_NewWithQualName(PyObject *, PyObject *, PyObject *); +PyAPI_FUNC(PyObject *) PyFunction_GetCode(PyObject *); +PyAPI_FUNC(PyObject *) PyFunction_GetGlobals(PyObject *); +PyAPI_FUNC(PyObject *) PyFunction_GetModule(PyObject *); +PyAPI_FUNC(PyObject *) PyFunction_GetDefaults(PyObject *); +PyAPI_FUNC(int) PyFunction_SetDefaults(PyObject *, PyObject *); +PyAPI_FUNC(void) PyFunction_SetVectorcall(PyFunctionObject *, vectorcallfunc); +PyAPI_FUNC(PyObject *) PyFunction_GetKwDefaults(PyObject *); +PyAPI_FUNC(int) PyFunction_SetKwDefaults(PyObject *, PyObject *); +PyAPI_FUNC(PyObject *) PyFunction_GetClosure(PyObject *); +PyAPI_FUNC(int) PyFunction_SetClosure(PyObject *, PyObject *); +PyAPI_FUNC(PyObject *) PyFunction_GetAnnotations(PyObject *); +PyAPI_FUNC(int) PyFunction_SetAnnotations(PyObject *, PyObject *); + +#define _PyFunction_CAST(func) \ + (assert(PyFunction_Check(func)), _Py_CAST(PyFunctionObject*, func)) + +/* Static inline functions for direct access to these values. + Type checks are *not* done, so use with care. 
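+
+   An illustrative use (not part of this header), after a successful
+   PyFunction_Check(func):
+
+       PyObject *code = PyFunction_GET_CODE(func);  // borrowed reference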
*/
+static inline PyObject* PyFunction_GET_CODE(PyObject *func) {
+    return _PyFunction_CAST(func)->func_code;
+}
+#define PyFunction_GET_CODE(func) PyFunction_GET_CODE(_PyObject_CAST(func))
+
+static inline PyObject* PyFunction_GET_GLOBALS(PyObject *func) {
+    return _PyFunction_CAST(func)->func_globals;
+}
+#define PyFunction_GET_GLOBALS(func) PyFunction_GET_GLOBALS(_PyObject_CAST(func))
+
+static inline PyObject* PyFunction_GET_MODULE(PyObject *func) {
+    return _PyFunction_CAST(func)->func_module;
+}
+#define PyFunction_GET_MODULE(func) PyFunction_GET_MODULE(_PyObject_CAST(func))
+
+static inline PyObject* PyFunction_GET_DEFAULTS(PyObject *func) {
+    return _PyFunction_CAST(func)->func_defaults;
+}
+#define PyFunction_GET_DEFAULTS(func) PyFunction_GET_DEFAULTS(_PyObject_CAST(func))
+
+static inline PyObject* PyFunction_GET_KW_DEFAULTS(PyObject *func) {
+    return _PyFunction_CAST(func)->func_kwdefaults;
+}
+#define PyFunction_GET_KW_DEFAULTS(func) PyFunction_GET_KW_DEFAULTS(_PyObject_CAST(func))
+
+static inline PyObject* PyFunction_GET_CLOSURE(PyObject *func) {
+    return _PyFunction_CAST(func)->func_closure;
+}
+#define PyFunction_GET_CLOSURE(func) PyFunction_GET_CLOSURE(_PyObject_CAST(func))
+
+static inline PyObject* PyFunction_GET_ANNOTATIONS(PyObject *func) {
+    return _PyFunction_CAST(func)->func_annotations;
+}
+#define PyFunction_GET_ANNOTATIONS(func) PyFunction_GET_ANNOTATIONS(_PyObject_CAST(func))
+
+/* The classmethod and staticmethod types live here, too */
+PyAPI_DATA(PyTypeObject) PyClassMethod_Type;
+PyAPI_DATA(PyTypeObject) PyStaticMethod_Type;
+
+PyAPI_FUNC(PyObject *) PyClassMethod_New(PyObject *);
+PyAPI_FUNC(PyObject *) PyStaticMethod_New(PyObject *);
+
+#define PY_FOREACH_FUNC_EVENT(V) \
+    V(CREATE)                    \
+    V(DESTROY)                   \
+    V(MODIFY_CODE)               \
+    V(MODIFY_DEFAULTS)           \
+    V(MODIFY_KWDEFAULTS)
+
+typedef enum {
+    #define PY_DEF_EVENT(EVENT) PyFunction_EVENT_##EVENT,
+    PY_FOREACH_FUNC_EVENT(PY_DEF_EVENT)
+    #undef PY_DEF_EVENT
+} PyFunction_WatchEvent;
+
+/*
+ * A callback that is invoked for different events in a function's lifecycle.
+ *
+ * The callback is invoked with a borrowed reference to func, after it is
+ * created and before it is modified or destroyed. The callback should not
+ * modify func.
+ *
+ * When a function's code object, defaults, or kwdefaults are modified the
+ * callback will be invoked with the respective event and new_value will
+ * contain a borrowed reference to the new value that is about to be stored in
+ * the function. Otherwise the third argument is NULL.
+ *
+ * If the callback returns with an exception set, it must return -1. Otherwise
+ * it should return 0.
+ */
+typedef int (*PyFunction_WatchCallback)(
+    PyFunction_WatchEvent event,
+    PyFunctionObject *func,
+    PyObject *new_value);
+
+/*
+ * Register a per-interpreter callback that will be invoked for function lifecycle
+ * events.
+ *
+ * Returns a handle that may be passed to PyFunction_ClearWatcher on success,
+ * or -1 and sets an error if no more handles are available.
+ */
+PyAPI_FUNC(int) PyFunction_AddWatcher(PyFunction_WatchCallback callback);
+
+/*
+ * Clear the watcher associated with the watcher_id handle.
+ *
+ * Returns 0 on success or -1 if no watcher exists for the supplied id.
+ */ +PyAPI_FUNC(int) PyFunction_ClearWatcher(int watcher_id); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_FUNCOBJECT_H */ +#endif /* Py_LIMITED_API */ diff --git a/Include/cpython/genobject.h b/Include/cpython/genobject.h new file mode 100644 index 0000000000000000000000000000000000000000..49e46c277d75ae8bc96580c9adeb96e41dae2ae8 --- /dev/null +++ b/Include/cpython/genobject.h @@ -0,0 +1,83 @@ +/* Generator object interface */ + +#ifndef Py_LIMITED_API +#ifndef Py_GENOBJECT_H +#define Py_GENOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +/* --- Generators --------------------------------------------------------- */ + +/* _PyGenObject_HEAD defines the initial segment of generator + and coroutine objects. */ +#define _PyGenObject_HEAD(prefix) \ + PyObject_HEAD \ + /* List of weak reference. */ \ + PyObject *prefix##_weakreflist; \ + /* Name of the generator. */ \ + PyObject *prefix##_name; \ + /* Qualified name of the generator. */ \ + PyObject *prefix##_qualname; \ + _PyErr_StackItem prefix##_exc_state; \ + PyObject *prefix##_origin_or_finalizer; \ + char prefix##_hooks_inited; \ + char prefix##_closed; \ + char prefix##_running_async; \ + /* The frame */ \ + int8_t prefix##_frame_state; \ + PyObject *prefix##_iframe[1]; \ + +typedef struct { + /* The gi_ prefix is intended to remind of generator-iterator. */ + _PyGenObject_HEAD(gi) +} PyGenObject; + +PyAPI_DATA(PyTypeObject) PyGen_Type; + +#define PyGen_Check(op) PyObject_TypeCheck((op), &PyGen_Type) +#define PyGen_CheckExact(op) Py_IS_TYPE((op), &PyGen_Type) + +PyAPI_FUNC(PyObject *) PyGen_New(PyFrameObject *); +PyAPI_FUNC(PyObject *) PyGen_NewWithQualName(PyFrameObject *, + PyObject *name, PyObject *qualname); +PyAPI_FUNC(PyCodeObject *) PyGen_GetCode(PyGenObject *gen); + + +/* --- PyCoroObject ------------------------------------------------------- */ + +typedef struct { + _PyGenObject_HEAD(cr) +} PyCoroObject; + +PyAPI_DATA(PyTypeObject) PyCoro_Type; + +#define PyCoro_CheckExact(op) Py_IS_TYPE((op), &PyCoro_Type) +PyAPI_FUNC(PyObject *) PyCoro_New(PyFrameObject *, + PyObject *name, PyObject *qualname); + + +/* --- Asynchronous Generators -------------------------------------------- */ + +typedef struct { + _PyGenObject_HEAD(ag) +} PyAsyncGenObject; + +PyAPI_DATA(PyTypeObject) PyAsyncGen_Type; +PyAPI_DATA(PyTypeObject) _PyAsyncGenASend_Type; + +PyAPI_FUNC(PyObject *) PyAsyncGen_New(PyFrameObject *, + PyObject *name, PyObject *qualname); + +#define PyAsyncGen_CheckExact(op) Py_IS_TYPE((op), &PyAsyncGen_Type) + +#define PyAsyncGenASend_CheckExact(op) Py_IS_TYPE((op), &_PyAsyncGenASend_Type) + + +#undef _PyGenObject_HEAD + +#ifdef __cplusplus +} +#endif +#endif /* !Py_GENOBJECT_H */ +#endif /* Py_LIMITED_API */ diff --git a/Include/cpython/import.h b/Include/cpython/import.h new file mode 100644 index 0000000000000000000000000000000000000000..7daf0b84fcf71bf172db2f7b60ff0e260053da73 --- /dev/null +++ b/Include/cpython/import.h @@ -0,0 +1,25 @@ +#ifndef Py_CPYTHON_IMPORT_H +# error "this header file must not be included directly" +#endif + +PyMODINIT_FUNC PyInit__imp(void); + +struct _inittab { + const char *name; /* ASCII encoded string */ + PyObject* (*initfunc)(void); +}; +// This is not used after Py_Initialize() is called. 
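+// An illustrative embedding sketch (module name and init function are
+// hypothetical):
+//
+//     static struct _inittab extra[] = {
+//         {"spam", PyInit_spam},
+//         {NULL, NULL},
+//     };
+//     PyImport_ExtendInittab(extra);  // must happen before Py_Initialize()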
+PyAPI_DATA(struct _inittab *) PyImport_Inittab; +PyAPI_FUNC(int) PyImport_ExtendInittab(struct _inittab *newtab); + +struct _frozen { + const char *name; /* ASCII encoded string */ + const unsigned char *code; + int size; + int is_package; +}; + +/* Embedding apps may change this pointer to point to their favorite + collection of frozen modules: */ + +PyAPI_DATA(const struct _frozen *) PyImport_FrozenModules; diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h new file mode 100644 index 0000000000000000000000000000000000000000..5da5ef9e5431b1f79bb5134a35f79ce95d90fea6 --- /dev/null +++ b/Include/cpython/initconfig.h @@ -0,0 +1,274 @@ +#ifndef Py_PYCORECONFIG_H +#define Py_PYCORECONFIG_H +#ifndef Py_LIMITED_API +#ifdef __cplusplus +extern "C" { +#endif + +/* --- PyStatus ----------------------------------------------- */ + +typedef struct { + enum { + _PyStatus_TYPE_OK=0, + _PyStatus_TYPE_ERROR=1, + _PyStatus_TYPE_EXIT=2 + } _type; + const char *func; + const char *err_msg; + int exitcode; +} PyStatus; + +PyAPI_FUNC(PyStatus) PyStatus_Ok(void); +PyAPI_FUNC(PyStatus) PyStatus_Error(const char *err_msg); +PyAPI_FUNC(PyStatus) PyStatus_NoMemory(void); +PyAPI_FUNC(PyStatus) PyStatus_Exit(int exitcode); +PyAPI_FUNC(int) PyStatus_IsError(PyStatus err); +PyAPI_FUNC(int) PyStatus_IsExit(PyStatus err); +PyAPI_FUNC(int) PyStatus_Exception(PyStatus err); + +/* --- PyWideStringList ------------------------------------------------ */ + +typedef struct { + /* If length is greater than zero, items must be non-NULL + and all items strings must be non-NULL */ + Py_ssize_t length; + wchar_t **items; +} PyWideStringList; + +PyAPI_FUNC(PyStatus) PyWideStringList_Append(PyWideStringList *list, + const wchar_t *item); +PyAPI_FUNC(PyStatus) PyWideStringList_Insert(PyWideStringList *list, + Py_ssize_t index, + const wchar_t *item); + + +/* --- PyPreConfig ----------------------------------------------- */ + +typedef struct PyPreConfig { + int _config_init; /* _PyConfigInitEnum value */ + + /* Parse Py_PreInitializeFromBytesArgs() arguments? + See PyConfig.parse_argv */ + int parse_argv; + + /* If greater than 0, enable isolated mode: sys.path contains + neither the script's directory nor the user's site-packages directory. + + Set to 1 by the -I command line option. If set to -1 (default), inherit + Py_IsolatedFlag value. */ + int isolated; + + /* If greater than 0: use environment variables. + Set to 0 by -E command line option. If set to -1 (default), it is + set to !Py_IgnoreEnvironmentFlag. */ + int use_environment; + + /* Set the LC_CTYPE locale to the user preferred locale? If equals to 0, + set coerce_c_locale and coerce_c_locale_warn to 0. */ + int configure_locale; + + /* Coerce the LC_CTYPE locale if it's equal to "C"? (PEP 538) + + Set to 0 by PYTHONCOERCECLOCALE=0. Set to 1 by PYTHONCOERCECLOCALE=1. + Set to 2 if the user preferred LC_CTYPE locale is "C". + + If it is equal to 1, LC_CTYPE locale is read to decide if it should be + coerced or not (ex: PYTHONCOERCECLOCALE=1). Internally, it is set to 2 + if the LC_CTYPE locale must be coerced. + + Disable by default (set to 0). Set it to -1 to let Python decide if it + should be enabled or not. */ + int coerce_c_locale; + + /* Emit a warning if the LC_CTYPE locale is coerced? + + Set to 1 by PYTHONCOERCECLOCALE=warn. + + Disable by default (set to 0). Set it to -1 to let Python decide if it + should be enabled or not. 
*/ + int coerce_c_locale_warn; + +#ifdef MS_WINDOWS + /* If greater than 1, use the "mbcs" encoding instead of the UTF-8 + encoding for the filesystem encoding. + + Set to 1 if the PYTHONLEGACYWINDOWSFSENCODING environment variable is + set to a non-empty string. If set to -1 (default), inherit + Py_LegacyWindowsFSEncodingFlag value. + + See PEP 529 for more details. */ + int legacy_windows_fs_encoding; +#endif + + /* Enable UTF-8 mode? (PEP 540) + + Disabled by default (equals to 0). + + Set to 1 by "-X utf8" and "-X utf8=1" command line options. + Set to 1 by PYTHONUTF8=1 environment variable. + + Set to 0 by "-X utf8=0" and PYTHONUTF8=0. + + If equals to -1, it is set to 1 if the LC_CTYPE locale is "C" or + "POSIX", otherwise it is set to 0. Inherits the Py_UTF8Mode value. */ + int utf8_mode; + + /* If non-zero, enable the Python Development Mode. + + Set to 1 by the -X dev command line option. Set by the PYTHONDEVMODE + environment variable. */ + int dev_mode; + + /* Memory allocator: PYTHONMALLOC env var. + See PyMemAllocatorName for valid values. */ + int allocator; +} PyPreConfig; + +PyAPI_FUNC(void) PyPreConfig_InitPythonConfig(PyPreConfig *config); +PyAPI_FUNC(void) PyPreConfig_InitIsolatedConfig(PyPreConfig *config); + + +/* --- PyConfig ---------------------------------------------- */ + +/* This structure is best documented in the Doc/c-api/init_config.rst file. */ +typedef struct PyConfig { + int _config_init; /* _PyConfigInitEnum value */ + + int isolated; + int use_environment; + int dev_mode; + int install_signal_handlers; + int use_hash_seed; + unsigned long hash_seed; + int faulthandler; + int tracemalloc; + int perf_profiling; + int import_time; + int code_debug_ranges; + int show_ref_count; + int dump_refs; + wchar_t *dump_refs_file; + int malloc_stats; + wchar_t *filesystem_encoding; + wchar_t *filesystem_errors; + wchar_t *pycache_prefix; + int parse_argv; + PyWideStringList orig_argv; + PyWideStringList argv; + PyWideStringList xoptions; + PyWideStringList warnoptions; + int site_import; + int bytes_warning; + int warn_default_encoding; + int inspect; + int interactive; + int optimization_level; + int parser_debug; + int write_bytecode; + int verbose; + int quiet; + int user_site_directory; + int configure_c_stdio; + int buffered_stdio; + wchar_t *stdio_encoding; + wchar_t *stdio_errors; +#ifdef MS_WINDOWS + int legacy_windows_stdio; +#endif + wchar_t *check_hash_pycs_mode; + int use_frozen_modules; + int safe_path; + int int_max_str_digits; + + int cpu_count; +#ifdef Py_GIL_DISABLED + int enable_gil; +#endif + + /* --- Path configuration inputs ------------ */ + int pathconfig_warnings; + wchar_t *program_name; + wchar_t *pythonpath_env; + wchar_t *home; + wchar_t *platlibdir; + + /* --- Path configuration outputs ----------- */ + int module_search_paths_set; + PyWideStringList module_search_paths; + wchar_t *stdlib_dir; + wchar_t *executable; + wchar_t *base_executable; + wchar_t *prefix; + wchar_t *base_prefix; + wchar_t *exec_prefix; + wchar_t *base_exec_prefix; + + /* --- Parameter only used by Py_Main() ---------- */ + int skip_source_first_line; + wchar_t *run_command; + wchar_t *run_module; + wchar_t *run_filename; + + /* --- Set by Py_Main() -------------------------- */ + wchar_t *sys_path_0; + + /* --- Private fields ---------------------------- */ + + // Install importlib? If equals to 0, importlib is not initialized at all. + // Needed by freeze_importlib.
+ int _install_importlib; + + // If equal to 0, stop Python initialization before the "main" phase. + int _init_main; + + // If non-zero, we believe we're running from a source tree. + int _is_python_build; + +#ifdef Py_STATS + // If non-zero, turns on statistics gathering. + int _pystats; +#endif + +#ifdef Py_DEBUG + // If not empty, import a non-__main__ module before site.py is executed. + // PYTHON_PRESITE=package.module or -X presite=package.module + wchar_t *run_presite; +#endif +} PyConfig; + +PyAPI_FUNC(void) PyConfig_InitPythonConfig(PyConfig *config); +PyAPI_FUNC(void) PyConfig_InitIsolatedConfig(PyConfig *config); +PyAPI_FUNC(void) PyConfig_Clear(PyConfig *); +PyAPI_FUNC(PyStatus) PyConfig_SetString( + PyConfig *config, + wchar_t **config_str, + const wchar_t *str); +PyAPI_FUNC(PyStatus) PyConfig_SetBytesString( + PyConfig *config, + wchar_t **config_str, + const char *str); +PyAPI_FUNC(PyStatus) PyConfig_Read(PyConfig *config); +PyAPI_FUNC(PyStatus) PyConfig_SetBytesArgv( + PyConfig *config, + Py_ssize_t argc, + char * const *argv); +PyAPI_FUNC(PyStatus) PyConfig_SetArgv(PyConfig *config, + Py_ssize_t argc, + wchar_t * const *argv); +PyAPI_FUNC(PyStatus) PyConfig_SetWideStringList(PyConfig *config, + PyWideStringList *list, + Py_ssize_t length, wchar_t **items); + + +/* --- Helper functions --------------------------------------- */ + +/* Get the original command line arguments, before Python modified them. + + See also PyConfig.orig_argv. */ +PyAPI_FUNC(void) Py_GetArgcArgv(int *argc, wchar_t ***argv); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_LIMITED_API */ +#endif /* !Py_PYCORECONFIG_H */ diff --git a/Include/cpython/listobject.h b/Include/cpython/listobject.h new file mode 100644 index 0000000000000000000000000000000000000000..49f5e8d6d1a0d6c2863925daaac16c07b6066948 --- /dev/null +++ b/Include/cpython/listobject.h @@ -0,0 +1,53 @@ +#ifndef Py_CPYTHON_LISTOBJECT_H +# error "this header file must not be included directly" +#endif + +typedef struct { + PyObject_VAR_HEAD + /* Vector of pointers to list elements. list[0] is ob_item[0], etc. */ + PyObject **ob_item; + + /* ob_item contains space for 'allocated' elements. The number + * currently in use is ob_size. + * Invariants: + * 0 <= ob_size <= allocated + * len(list) == ob_size + * ob_item == NULL implies ob_size == allocated == 0 + * list.sort() temporarily sets allocated to -1 to detect mutations. + * + * Items must normally not be NULL, except during construction when + * the list is not yet visible outside the function that builds it. + */ + Py_ssize_t allocated; +} PyListObject; + +/* Cast argument to PyListObject* type. 
*/ +#define _PyList_CAST(op) \ + (assert(PyList_Check(op)), _Py_CAST(PyListObject*, (op))) + +// Macros and static inline functions, trading safety for speed + +static inline Py_ssize_t PyList_GET_SIZE(PyObject *op) { + PyListObject *list = _PyList_CAST(op); +#ifdef Py_GIL_DISABLED + return _Py_atomic_load_ssize_relaxed(&(_PyVarObject_CAST(list)->ob_size)); +#else + return Py_SIZE(list); +#endif +} +#define PyList_GET_SIZE(op) PyList_GET_SIZE(_PyObject_CAST(op)) + +#define PyList_GET_ITEM(op, index) (_PyList_CAST(op)->ob_item[(index)]) + +static inline void +PyList_SET_ITEM(PyObject *op, Py_ssize_t index, PyObject *value) { + PyListObject *list = _PyList_CAST(op); + assert(0 <= index); + assert(index < list->allocated); + list->ob_item[index] = value; +} +#define PyList_SET_ITEM(op, index, value) \ + PyList_SET_ITEM(_PyObject_CAST(op), (index), _PyObject_CAST(value)) + +PyAPI_FUNC(int) PyList_Extend(PyObject *self, PyObject *iterable); +PyAPI_FUNC(int) PyList_Clear(PyObject *self); diff --git a/Include/cpython/lock.h b/Include/cpython/lock.h new file mode 100644 index 0000000000000000000000000000000000000000..8ee03e82f74dfdc747a6af65b3baf7f3095792b7 --- /dev/null +++ b/Include/cpython/lock.h @@ -0,0 +1,63 @@ +#ifndef Py_CPYTHON_LOCK_H +# error "this header file must not be included directly" +#endif + +#define _Py_UNLOCKED 0 +#define _Py_LOCKED 1 + +// A mutex that occupies one byte. The lock can be zero initialized to +// represent the unlocked state. +// +// Typical initialization: +// PyMutex m = (PyMutex){0}; +// +// Or initialize as global variables: +// static PyMutex m; +// +// Typical usage: +// PyMutex_Lock(&m); +// ... +// PyMutex_Unlock(&m); +// +// The contents of the PyMutex are not part of the public API, but are +// described to aid in understanding the implementation and debugging. Only +// the two least significant bits are used. The remaining bits are always zero: +// 0b00: unlocked +// 0b01: locked +// 0b10: unlocked and has parked threads +// 0b11: locked and has parked threads +typedef struct PyMutex { + uint8_t _bits; // (private) +} PyMutex; + +// exported function for locking the mutex +PyAPI_FUNC(void) PyMutex_Lock(PyMutex *m); + +// exported function for unlocking the mutex +PyAPI_FUNC(void) PyMutex_Unlock(PyMutex *m); + +// Locks the mutex. +// +// If the mutex is currently locked, the calling thread will be parked until +// the mutex is unlocked. If the current thread holds the GIL, then the GIL +// will be released while the thread is parked. +static inline void +_PyMutex_Lock(PyMutex *m) +{ + uint8_t expected = _Py_UNLOCKED; + if (!_Py_atomic_compare_exchange_uint8(&m->_bits, &expected, _Py_LOCKED)) { + PyMutex_Lock(m); + } +} +#define PyMutex_Lock _PyMutex_Lock + +// Unlocks the mutex. +static inline void +_PyMutex_Unlock(PyMutex *m) +{ + uint8_t expected = _Py_LOCKED; + if (!_Py_atomic_compare_exchange_uint8(&m->_bits, &expected, _Py_UNLOCKED)) { + PyMutex_Unlock(m); + } +} +#define PyMutex_Unlock _PyMutex_Unlock diff --git a/Include/cpython/longintrepr.h b/Include/cpython/longintrepr.h new file mode 100644 index 0000000000000000000000000000000000000000..3246908ba982e217d1b3987dbae9884f075c35ad --- /dev/null +++ b/Include/cpython/longintrepr.h @@ -0,0 +1,146 @@ +#ifndef Py_LIMITED_API +#ifndef Py_LONGINTREPR_H +#define Py_LONGINTREPR_H +#ifdef __cplusplus +extern "C" { +#endif + + +/* This is published for the benefit of "friends" marshal.c and _decimal.c. */ + +/* Parameters of the integer representation. 
There are two different + sets of parameters: one set for 30-bit digits, stored in an unsigned 32-bit + integer type, and one set for 15-bit digits with each digit stored in an + unsigned short. The value of PYLONG_BITS_IN_DIGIT, defined either at + configure time or in pyport.h, is used to decide which digit size to use. + + Type 'digit' should be able to hold 2*PyLong_BASE-1, and type 'twodigits' + should be an unsigned integer type able to hold all integers up to + PyLong_BASE*PyLong_BASE-1. x_sub assumes that 'digit' is an unsigned type, + and that overflow is handled by taking the result modulo 2**N for some N > + PyLong_SHIFT. The majority of the code doesn't care about the precise + value of PyLong_SHIFT, but there are some notable exceptions: + + - PyLong_{As,From}ByteArray require that PyLong_SHIFT be at least 8 + + - long_hash() requires that PyLong_SHIFT is *strictly* less than the number + of bits in an unsigned long, as do the PyLong <-> long (or unsigned long) + conversion functions + + - the Python int <-> size_t/Py_ssize_t conversion functions expect that + PyLong_SHIFT is strictly less than the number of bits in a size_t + + - the marshal code currently expects that PyLong_SHIFT is a multiple of 15 + + - NSMALLNEGINTS and NSMALLPOSINTS should be small enough to fit in a single + digit; with the current values this forces PyLong_SHIFT >= 9 + + The values 15 and 30 should fit all of the above requirements, on any + platform. +*/ + +#if PYLONG_BITS_IN_DIGIT == 30 +typedef uint32_t digit; +typedef int32_t sdigit; /* signed variant of digit */ +typedef uint64_t twodigits; +typedef int64_t stwodigits; /* signed variant of twodigits */ +#define PyLong_SHIFT 30 +#define _PyLong_DECIMAL_SHIFT 9 /* max(e such that 10**e fits in a digit) */ +#define _PyLong_DECIMAL_BASE ((digit)1000000000) /* 10 ** DECIMAL_SHIFT */ +#elif PYLONG_BITS_IN_DIGIT == 15 +typedef unsigned short digit; +typedef short sdigit; /* signed variant of digit */ +typedef unsigned long twodigits; +typedef long stwodigits; /* signed variant of twodigits */ +#define PyLong_SHIFT 15 +#define _PyLong_DECIMAL_SHIFT 4 /* max(e such that 10**e fits in a digit) */ +#define _PyLong_DECIMAL_BASE ((digit)10000) /* 10 ** DECIMAL_SHIFT */ +#else +#error "PYLONG_BITS_IN_DIGIT should be 15 or 30" +#endif +#define PyLong_BASE ((digit)1 << PyLong_SHIFT) +#define PyLong_MASK ((digit)(PyLong_BASE - 1)) + +/* Long integer representation. + + Long integers are made up of a number of 30- or 15-bit digits, depending on + the platform. The number of digits (ndigits) is stored in the high bits of + the lv_tag field (lv_tag >> _PyLong_NON_SIZE_BITS). + + The absolute value of a number is equal to + SUM(for i=0 through ndigits-1) ob_digit[i] * 2**(PyLong_SHIFT*i) + + The sign of the value is stored in the lower 2 bits of lv_tag. + + - 0: Positive + - 1: Zero + - 2: Negative + + The third lowest bit of lv_tag is reserved for an immortality flag, but is + not currently used. + + In a normalized number, ob_digit[ndigits-1] (the most significant + digit) is never zero. Also, in all cases, for all valid i, + 0 <= ob_digit[i] <= PyLong_MASK. + + The allocation function takes care of allocating extra memory + so that ob_digit[0] ... ob_digit[ndigits-1] are actually available. + We always allocate memory for at least one digit, so accessing ob_digit[0] + is always safe. However, in the case ndigits == 0, the contents of + ob_digit[0] may be undefined.
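+ +   Worked example (illustrative, assuming 30-bit digits): the value 2**40 + 7 +   is stored with ndigits == 2, ob_digit[0] == 7 and ob_digit[1] == 1024, +   because 2**40 + 7 == 7 + 1024 * 2**30; the low two bits of lv_tag are 0 +   (positive).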
+*/ + +typedef struct _PyLongValue { + uintptr_t lv_tag; /* Number of digits, sign and flags */ + digit ob_digit[1]; +} _PyLongValue; + +struct _longobject { + PyObject_HEAD + _PyLongValue long_value; +}; + +PyAPI_FUNC(PyLongObject*) _PyLong_New(Py_ssize_t); + +// Return a copy of src. +PyAPI_FUNC(PyObject*) _PyLong_Copy(PyLongObject *src); + +PyAPI_FUNC(PyLongObject*) _PyLong_FromDigits( + int negative, + Py_ssize_t digit_count, + digit *digits); + + +/* Inline some internals for speed. These should be in pycore_long.h + * if user code didn't need them inlined. */ + +#define _PyLong_SIGN_MASK 3 +#define _PyLong_NON_SIZE_BITS 3 + + +static inline int +_PyLong_IsCompact(const PyLongObject* op) { + assert(PyType_HasFeature((op)->ob_base.ob_type, Py_TPFLAGS_LONG_SUBCLASS)); + return op->long_value.lv_tag < (2 << _PyLong_NON_SIZE_BITS); +} + +#define PyUnstable_Long_IsCompact _PyLong_IsCompact + +static inline Py_ssize_t +_PyLong_CompactValue(const PyLongObject *op) +{ + Py_ssize_t sign; + assert(PyType_HasFeature((op)->ob_base.ob_type, Py_TPFLAGS_LONG_SUBCLASS)); + assert(PyUnstable_Long_IsCompact(op)); + sign = 1 - (op->long_value.lv_tag & _PyLong_SIGN_MASK); + return sign * (Py_ssize_t)op->long_value.ob_digit[0]; +} + +#define PyUnstable_Long_CompactValue _PyLong_CompactValue + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_LONGINTREPR_H */ +#endif /* Py_LIMITED_API */ diff --git a/Include/cpython/longobject.h b/Include/cpython/longobject.h new file mode 100644 index 0000000000000000000000000000000000000000..0d49242ff6808cac1e24d1780b51945b3caf1869 --- /dev/null +++ b/Include/cpython/longobject.h @@ -0,0 +1,116 @@ +#ifndef Py_CPYTHON_LONGOBJECT_H +# error "this header file must not be included directly" +#endif + +PyAPI_FUNC(PyObject*) PyLong_FromUnicodeObject(PyObject *u, int base); + +#define Py_ASNATIVEBYTES_DEFAULTS -1 +#define Py_ASNATIVEBYTES_BIG_ENDIAN 0 +#define Py_ASNATIVEBYTES_LITTLE_ENDIAN 1 +#define Py_ASNATIVEBYTES_NATIVE_ENDIAN 3 +#define Py_ASNATIVEBYTES_UNSIGNED_BUFFER 4 +#define Py_ASNATIVEBYTES_REJECT_NEGATIVE 8 +#define Py_ASNATIVEBYTES_ALLOW_INDEX 16 + +/* PyLong_AsNativeBytes: Copy the integer value to a native variable. + buffer points to the first byte of the variable. + n_bytes is the number of bytes available in the buffer. Pass 0 to request + the required size for the value. + flags is a bitfield of the following flags: + * 1 - little endian + * 2 - native endian + * 4 - unsigned destination (e.g. don't reject copying 255 into one byte) + * 8 - raise an exception for negative inputs + * 16 - call __index__ on non-int types + If flags is -1 (all bits set), native endian is used, value truncation + behaves most like C (allows negative inputs and allow MSB set), and non-int + objects will raise a TypeError. + Big endian mode will write the most significant byte into the address + directly referenced by buffer; little endian will write the least significant + byte into that address. + + If an exception is raised, returns a negative value. + Otherwise, returns the number of bytes that are required to store the value. + To check that the full value is represented, ensure that the return value is + equal or less than n_bytes. + All n_bytes are guaranteed to be written (unless an exception occurs), and + so ignoring a positive return value is the equivalent of a downcast in C. + In cases where the full value could not be represented, the returned value + may be larger than necessary - this function is not an accurate way to + calculate the bit length of an integer object. 
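+ +   Usage sketch (illustrative; "obj" is assumed to be a Python int): + +       int64_t value; +       Py_ssize_t n = PyLong_AsNativeBytes(obj, &value, sizeof(value), -1); +       if (n < 0) { +           // error: an exception is set +       } +       else if ((size_t)n > sizeof(value)) { +           // the value did not fit and was truncated, like a C downcast +       }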
+ */ +PyAPI_FUNC(Py_ssize_t) PyLong_AsNativeBytes(PyObject* v, void* buffer, + Py_ssize_t n_bytes, int flags); + +/* PyLong_FromNativeBytes: Create an int value from a native integer + n_bytes is the number of bytes to read from the buffer. Passing 0 will + always produce the zero int. + PyLong_FromUnsignedNativeBytes always produces a non-negative int. + flags is the same as for PyLong_AsNativeBytes, but only supports selecting + the endianness or forcing an unsigned buffer. + + Returns the int object, or NULL with an exception set. */ +PyAPI_FUNC(PyObject*) PyLong_FromNativeBytes(const void* buffer, size_t n_bytes, + int flags); +PyAPI_FUNC(PyObject*) PyLong_FromUnsignedNativeBytes(const void* buffer, + size_t n_bytes, int flags); + +PyAPI_FUNC(int) PyUnstable_Long_IsCompact(const PyLongObject* op); +PyAPI_FUNC(Py_ssize_t) PyUnstable_Long_CompactValue(const PyLongObject* op); + +// _PyLong_Sign. Return 0 if v is 0, -1 if v < 0, +1 if v > 0. +// v must not be NULL, and must be a normalized long. +// There are no error cases. +PyAPI_FUNC(int) _PyLong_Sign(PyObject *v); + +/* _PyLong_NumBits. Return the number of bits needed to represent the + absolute value of a long. For example, this returns 1 for 1 and -1, 2 + for 2 and -2, and 2 for 3 and -3. It returns 0 for 0. + v must not be NULL, and must be a normalized long. + (size_t)-1 is returned and OverflowError set if the true result doesn't + fit in a size_t. +*/ +PyAPI_FUNC(size_t) _PyLong_NumBits(PyObject *v); + +/* _PyLong_FromByteArray: View the n unsigned bytes as a binary integer in + base 256, and return a Python int with the same numeric value. + If n is 0, the integer is 0. Else: + If little_endian is 1/true, bytes[n-1] is the MSB and bytes[0] the LSB; + else (little_endian is 0/false) bytes[0] is the MSB and bytes[n-1] the + LSB. + If is_signed is 0/false, view the bytes as a non-negative integer. + If is_signed is 1/true, view the bytes as a 2's-complement integer, + non-negative if bit 0x80 of the MSB is clear, negative if set. + Error returns: + + Return NULL with the appropriate exception set if there's not + enough memory to create the Python int. +*/ +PyAPI_FUNC(PyObject *) _PyLong_FromByteArray( + const unsigned char* bytes, size_t n, + int little_endian, int is_signed); + +/* _PyLong_AsByteArray: Convert the least-significant 8*n bits of long + v to a base-256 integer, stored in array bytes. Normally return 0, + return -1 on error. + If little_endian is 1/true, store the MSB at bytes[n-1] and the LSB at + bytes[0]; else (little_endian is 0/false) store the MSB at bytes[0] and + the LSB at bytes[n-1]. + If is_signed is 0/false, it's an error if v < 0; else (v >= 0) n bytes + are filled and there's nothing special about bit 0x80 of the MSB. + If is_signed is 1/true, bytes is filled with the 2's-complement + representation of v's value. Bit 0x80 of the MSB is the sign bit. + Error returns (-1): + + is_signed is 0 and v < 0. TypeError is set in this case, and bytes + isn't altered. + + n isn't big enough to hold the full mathematical value of v. For + example, if is_signed is 0 and there are more digits in the v than + fit in n; or if is_signed is 1, v < 0, and n is just 1 bit shy of + being large enough to hold a sign bit. OverflowError is set in this + case, but bytes holds the least-significant n bytes of the true value. 
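+ +   Usage sketch (illustrative): store v into 8 little-endian signed bytes, +   passing with_exceptions=1 so that failures set an exception: + +       unsigned char buf[8]; +       if (_PyLong_AsByteArray((PyLongObject *)v, buf, sizeof(buf), +                               1, 1, 1) < 0) { +           // error: OverflowError or TypeError is set +       }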
+*/ +PyAPI_FUNC(int) _PyLong_AsByteArray(PyLongObject* v, + unsigned char* bytes, size_t n, + int little_endian, int is_signed, int with_exceptions); + +/* For use by the gcd function in mathmodule.c */ +PyAPI_FUNC(PyObject *) _PyLong_GCD(PyObject *, PyObject *); diff --git a/Include/cpython/memoryobject.h b/Include/cpython/memoryobject.h new file mode 100644 index 0000000000000000000000000000000000000000..961161b70f20580b9d1c1408b3b58aebf01214ec --- /dev/null +++ b/Include/cpython/memoryobject.h @@ -0,0 +1,50 @@ +#ifndef Py_CPYTHON_MEMORYOBJECT_H +# error "this header file must not be included directly" +#endif + +/* The structs are declared here so that macros can work, but they shouldn't + be considered public. Don't access their fields directly, use the macros + and functions instead! */ +#define _Py_MANAGED_BUFFER_RELEASED 0x001 /* access to exporter blocked */ +#define _Py_MANAGED_BUFFER_FREE_FORMAT 0x002 /* free format */ + +typedef struct { + PyObject_HEAD + int flags; /* state flags */ + Py_ssize_t exports; /* number of direct memoryview exports */ + Py_buffer master; /* snapshot buffer obtained from the original exporter */ +} _PyManagedBufferObject; + + +/* memoryview state flags */ +#define _Py_MEMORYVIEW_RELEASED 0x001 /* access to master buffer blocked */ +#define _Py_MEMORYVIEW_C 0x002 /* C-contiguous layout */ +#define _Py_MEMORYVIEW_FORTRAN 0x004 /* Fortran contiguous layout */ +#define _Py_MEMORYVIEW_SCALAR 0x008 /* scalar: ndim = 0 */ +#define _Py_MEMORYVIEW_PIL 0x010 /* PIL-style layout */ +#define _Py_MEMORYVIEW_RESTRICTED 0x020 /* Disallow new references to the memoryview's buffer */ + +typedef struct { + PyObject_VAR_HEAD + _PyManagedBufferObject *mbuf; /* managed buffer */ + Py_hash_t hash; /* hash value for read-only views */ + int flags; /* state flags */ + Py_ssize_t exports; /* number of buffer re-exports */ + Py_buffer view; /* private copy of the exporter's view */ + PyObject *weakreflist; + Py_ssize_t ob_array[1]; /* shape, strides, suboffsets */ +} PyMemoryViewObject; + +#define _PyMemoryView_CAST(op) _Py_CAST(PyMemoryViewObject*, op) + +/* Get a pointer to the memoryview's private copy of the exporter's buffer. */ +static inline Py_buffer* PyMemoryView_GET_BUFFER(PyObject *op) { + return (&_PyMemoryView_CAST(op)->view); +} +#define PyMemoryView_GET_BUFFER(op) PyMemoryView_GET_BUFFER(_PyObject_CAST(op)) + +/* Get a pointer to the exporting object (this may be NULL!). 
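+   For example (illustrative), memoryviews created with +   PyMemoryView_FromMemory() or PyMemoryView_FromBuffer() have no exporting +   object, so this returns NULL for them.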
*/ +static inline PyObject* PyMemoryView_GET_BASE(PyObject *op) { + return _PyMemoryView_CAST(op)->view.obj; +} +#define PyMemoryView_GET_BASE(op) PyMemoryView_GET_BASE(_PyObject_CAST(op)) diff --git a/Include/cpython/methodobject.h b/Include/cpython/methodobject.h new file mode 100644 index 0000000000000000000000000000000000000000..d541e1549480417fbce8933e222821ef8aab6247 --- /dev/null +++ b/Include/cpython/methodobject.h @@ -0,0 +1,66 @@ +#ifndef Py_CPYTHON_METHODOBJECT_H +# error "this header file must not be included directly" +#endif + +// PyCFunctionObject structure + +typedef struct { + PyObject_HEAD + PyMethodDef *m_ml; /* Description of the C function to call */ + PyObject *m_self; /* Passed as 'self' arg to the C func, can be NULL */ + PyObject *m_module; /* The __module__ attribute, can be anything */ + PyObject *m_weakreflist; /* List of weak references */ + vectorcallfunc vectorcall; +} PyCFunctionObject; + +#define _PyCFunctionObject_CAST(func) \ + (assert(PyCFunction_Check(func)), \ + _Py_CAST(PyCFunctionObject*, (func))) + + +// PyCMethodObject structure + +typedef struct { + PyCFunctionObject func; + PyTypeObject *mm_class; /* Class that defines this method */ +} PyCMethodObject; + +#define _PyCMethodObject_CAST(func) \ + (assert(PyCMethod_Check(func)), \ + _Py_CAST(PyCMethodObject*, (func))) + +PyAPI_DATA(PyTypeObject) PyCMethod_Type; + +#define PyCMethod_CheckExact(op) Py_IS_TYPE((op), &PyCMethod_Type) +#define PyCMethod_Check(op) PyObject_TypeCheck((op), &PyCMethod_Type) + + +/* Static inline functions for direct access to these values. + Type checks are *not* done, so use with care. */ +static inline PyCFunction PyCFunction_GET_FUNCTION(PyObject *func) { + return _PyCFunctionObject_CAST(func)->m_ml->ml_meth; +} +#define PyCFunction_GET_FUNCTION(func) PyCFunction_GET_FUNCTION(_PyObject_CAST(func)) + +static inline PyObject* PyCFunction_GET_SELF(PyObject *func_obj) { + PyCFunctionObject *func = _PyCFunctionObject_CAST(func_obj); + if (func->m_ml->ml_flags & METH_STATIC) { + return _Py_NULL; + } + return func->m_self; +} +#define PyCFunction_GET_SELF(func) PyCFunction_GET_SELF(_PyObject_CAST(func)) + +static inline int PyCFunction_GET_FLAGS(PyObject *func) { + return _PyCFunctionObject_CAST(func)->m_ml->ml_flags; +} +#define PyCFunction_GET_FLAGS(func) PyCFunction_GET_FLAGS(_PyObject_CAST(func)) + +static inline PyTypeObject* PyCFunction_GET_CLASS(PyObject *func_obj) { + PyCFunctionObject *func = _PyCFunctionObject_CAST(func_obj); + if (func->m_ml->ml_flags & METH_METHOD) { + return _PyCMethodObject_CAST(func)->mm_class; + } + return _Py_NULL; +} +#define PyCFunction_GET_CLASS(func) PyCFunction_GET_CLASS(_PyObject_CAST(func)) diff --git a/Include/cpython/modsupport.h b/Include/cpython/modsupport.h new file mode 100644 index 0000000000000000000000000000000000000000..d3b88f58c82ca3e923cb7756871113f1dfcafec7 --- /dev/null +++ b/Include/cpython/modsupport.h @@ -0,0 +1,26 @@ +#ifndef Py_CPYTHON_MODSUPPORT_H +# error "this header file must not be included directly" +#endif + +// A data structure that can be used to run initialization code once in a +// thread-safe manner. The C++11 equivalent is std::call_once. +typedef struct { + uint8_t v; +} _PyOnceFlag; + +typedef struct _PyArg_Parser { + const char *format; + const char * const *keywords; + const char *fname; + const char *custom_msg; + _PyOnceFlag once; /* atomic one-time initialization flag */ + int is_kwtuple_owned; /* does this parser own the kwtuple object? 
*/ + int pos; /* number of positional-only arguments */ + int min; /* minimal number of arguments */ + int max; /* maximal number of positional arguments */ + PyObject *kwtuple; /* tuple of keyword parameter names */ + struct _PyArg_Parser *next; +} _PyArg_Parser; + +PyAPI_FUNC(int) _PyArg_ParseTupleAndKeywordsFast(PyObject *, PyObject *, + struct _PyArg_Parser *, ...); diff --git a/Include/cpython/monitoring.h b/Include/cpython/monitoring.h new file mode 100644 index 0000000000000000000000000000000000000000..797ba51246b1c63e5ee3a765c7742751e5acd002 --- /dev/null +++ b/Include/cpython/monitoring.h @@ -0,0 +1,250 @@ +#ifndef Py_CPYTHON_MONITORING_H +# error "this header file must not be included directly" +#endif + +/* Local events. + * These require bytecode instrumentation */ + +#define PY_MONITORING_EVENT_PY_START 0 +#define PY_MONITORING_EVENT_PY_RESUME 1 +#define PY_MONITORING_EVENT_PY_RETURN 2 +#define PY_MONITORING_EVENT_PY_YIELD 3 +#define PY_MONITORING_EVENT_CALL 4 +#define PY_MONITORING_EVENT_LINE 5 +#define PY_MONITORING_EVENT_INSTRUCTION 6 +#define PY_MONITORING_EVENT_JUMP 7 +#define PY_MONITORING_EVENT_BRANCH 8 +#define PY_MONITORING_EVENT_STOP_ITERATION 9 + +#define PY_MONITORING_IS_INSTRUMENTED_EVENT(ev) \ + ((ev) < _PY_MONITORING_LOCAL_EVENTS) + +/* Other events, mainly exceptions */ + +#define PY_MONITORING_EVENT_RAISE 10 +#define PY_MONITORING_EVENT_EXCEPTION_HANDLED 11 +#define PY_MONITORING_EVENT_PY_UNWIND 12 +#define PY_MONITORING_EVENT_PY_THROW 13 +#define PY_MONITORING_EVENT_RERAISE 14 + + +/* Ancillary events */ + +#define PY_MONITORING_EVENT_C_RETURN 15 +#define PY_MONITORING_EVENT_C_RAISE 16 + + +typedef struct _PyMonitoringState { + uint8_t active; + uint8_t opaque; +} PyMonitoringState; + + +PyAPI_FUNC(int) +PyMonitoring_EnterScope(PyMonitoringState *state_array, uint64_t *version, + const uint8_t *event_types, Py_ssize_t length); + +PyAPI_FUNC(int) +PyMonitoring_ExitScope(void); + + +PyAPI_FUNC(int) +_PyMonitoring_FirePyStartEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset); + +PyAPI_FUNC(int) +_PyMonitoring_FirePyResumeEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset); + +PyAPI_FUNC(int) +_PyMonitoring_FirePyReturnEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + PyObject *retval); + +PyAPI_FUNC(int) +_PyMonitoring_FirePyYieldEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + PyObject *retval); + +PyAPI_FUNC(int) +_PyMonitoring_FireCallEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + PyObject* callable, PyObject *arg0); + +PyAPI_FUNC(int) +_PyMonitoring_FireLineEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + int lineno); + +PyAPI_FUNC(int) +_PyMonitoring_FireJumpEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + PyObject *target_offset); + +PyAPI_FUNC(int) +_PyMonitoring_FireBranchEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + PyObject *target_offset); + +PyAPI_FUNC(int) +_PyMonitoring_FireCReturnEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + PyObject *retval); + +PyAPI_FUNC(int) +_PyMonitoring_FirePyThrowEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset); + +PyAPI_FUNC(int) +_PyMonitoring_FireRaiseEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset); + +PyAPI_FUNC(int) +_PyMonitoring_FireReraiseEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset); + +PyAPI_FUNC(int) +_PyMonitoring_FireExceptionHandledEvent(PyMonitoringState *state, 
PyObject *codelike, int32_t offset); + +PyAPI_FUNC(int) +_PyMonitoring_FireCRaiseEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset); + +PyAPI_FUNC(int) +_PyMonitoring_FirePyUnwindEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset); + +PyAPI_FUNC(int) +_PyMonitoring_FireStopIterationEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, PyObject *value); + + +#define _PYMONITORING_IF_ACTIVE(STATE, X) \ + if ((STATE)->active) { \ + return (X); \ + } \ + else { \ + return 0; \ + } + +static inline int +PyMonitoring_FirePyStartEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FirePyStartEvent(state, codelike, offset)); +} + +static inline int +PyMonitoring_FirePyResumeEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FirePyResumeEvent(state, codelike, offset)); +} + +static inline int +PyMonitoring_FirePyReturnEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + PyObject *retval) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FirePyReturnEvent(state, codelike, offset, retval)); +} + +static inline int +PyMonitoring_FirePyYieldEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + PyObject *retval) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FirePyYieldEvent(state, codelike, offset, retval)); +} + +static inline int +PyMonitoring_FireCallEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + PyObject* callable, PyObject *arg0) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FireCallEvent(state, codelike, offset, callable, arg0)); +} + +static inline int +PyMonitoring_FireLineEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + int lineno) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FireLineEvent(state, codelike, offset, lineno)); +} + +static inline int +PyMonitoring_FireJumpEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + PyObject *target_offset) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FireJumpEvent(state, codelike, offset, target_offset)); +} + +static inline int +PyMonitoring_FireBranchEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + PyObject *target_offset) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FireBranchEvent(state, codelike, offset, target_offset)); +} + +static inline int +PyMonitoring_FireCReturnEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, + PyObject *retval) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FireCReturnEvent(state, codelike, offset, retval)); +} + +static inline int +PyMonitoring_FirePyThrowEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FirePyThrowEvent(state, codelike, offset)); +} + +static inline int +PyMonitoring_FireRaiseEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FireRaiseEvent(state, codelike, offset)); +} + +static inline int +PyMonitoring_FireReraiseEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FireReraiseEvent(state, codelike, offset)); +} + +static inline int +PyMonitoring_FireExceptionHandledEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset) +{ + _PYMONITORING_IF_ACTIVE( + state, + 
_PyMonitoring_FireExceptionHandledEvent(state, codelike, offset)); +} + +static inline int +PyMonitoring_FireCRaiseEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FireCRaiseEvent(state, codelike, offset)); +} + +static inline int +PyMonitoring_FirePyUnwindEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FirePyUnwindEvent(state, codelike, offset)); +} + +static inline int +PyMonitoring_FireStopIterationEvent(PyMonitoringState *state, PyObject *codelike, int32_t offset, PyObject *value) +{ + _PYMONITORING_IF_ACTIVE( + state, + _PyMonitoring_FireStopIterationEvent(state, codelike, offset, value)); +} + +#undef _PYMONITORING_IF_ACTIVE diff --git a/Include/cpython/object.h b/Include/cpython/object.h new file mode 100644 index 0000000000000000000000000000000000000000..6cb2c40fe2eb71b38716245f977666e26dd928a6 --- /dev/null +++ b/Include/cpython/object.h @@ -0,0 +1,525 @@ +#ifndef Py_CPYTHON_OBJECT_H +# error "this header file must not be included directly" +#endif + +PyAPI_FUNC(void) _Py_NewReference(PyObject *op); +PyAPI_FUNC(void) _Py_NewReferenceNoTotal(PyObject *op); +PyAPI_FUNC(void) _Py_ResurrectReference(PyObject *op); + +#ifdef Py_REF_DEBUG +/* These are useful as debugging aids when chasing down refleaks. */ +PyAPI_FUNC(Py_ssize_t) _Py_GetGlobalRefTotal(void); +# define _Py_GetRefTotal() _Py_GetGlobalRefTotal() +PyAPI_FUNC(Py_ssize_t) _Py_GetLegacyRefTotal(void); +PyAPI_FUNC(Py_ssize_t) _PyInterpreterState_GetRefTotal(PyInterpreterState *); +#endif + + +/********************* String Literals ****************************************/ +/* This structure helps managing static strings. The basic usage goes like this: + Instead of doing + + r = PyObject_CallMethod(o, "foo", "args", ...); + + do + + _Py_IDENTIFIER(foo); + ... + r = _PyObject_CallMethodId(o, &PyId_foo, "args", ...); + + PyId_foo is a static variable, either on block level or file level. On first + usage, the string "foo" is interned, and the structures are linked. On interpreter + shutdown, all strings are released. + + Alternatively, _Py_static_string allows choosing the variable name. + _PyUnicode_FromId returns a borrowed reference to the interned string. + _PyObject_{Get,Set,Has}AttrId are __getattr__ versions using _Py_Identifier*. +*/ +typedef struct _Py_Identifier { + const char* string; + // Index in PyInterpreterState.unicode.ids.array. It is process-wide + // unique and must be initialized to -1. + Py_ssize_t index; + // Hidden PyMutex struct for non free-threaded build. + struct { + uint8_t v; + } mutex; +} _Py_Identifier; + +#ifndef Py_BUILD_CORE +// For now we are keeping _Py_IDENTIFIER for continued use +// in non-builtin extensions (and naughty PyPI modules). + +#define _Py_static_string_init(value) { .string = (value), .index = -1 } +#define _Py_static_string(varname, value) static _Py_Identifier varname = _Py_static_string_init(value) +#define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname) + +#endif /* !Py_BUILD_CORE */ + + +typedef struct { + /* Number implementations must check *both* + arguments for proper type and implement the necessary conversions + in the slot functions themselves. 
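+ +       Illustrative sketch of such a check in an nb_add slot ("MyNum_Check" +       and "mynum_add" are hypothetical): + +           static PyObject * +           mynum_add(PyObject *a, PyObject *b) +           { +               if (!MyNum_Check(a) || !MyNum_Check(b)) { +                   Py_RETURN_NOTIMPLEMENTED; +               } +               // ... perform the actual addition ... +           }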
*/ + + binaryfunc nb_add; + binaryfunc nb_subtract; + binaryfunc nb_multiply; + binaryfunc nb_remainder; + binaryfunc nb_divmod; + ternaryfunc nb_power; + unaryfunc nb_negative; + unaryfunc nb_positive; + unaryfunc nb_absolute; + inquiry nb_bool; + unaryfunc nb_invert; + binaryfunc nb_lshift; + binaryfunc nb_rshift; + binaryfunc nb_and; + binaryfunc nb_xor; + binaryfunc nb_or; + unaryfunc nb_int; + void *nb_reserved; /* the slot formerly known as nb_long */ + unaryfunc nb_float; + + binaryfunc nb_inplace_add; + binaryfunc nb_inplace_subtract; + binaryfunc nb_inplace_multiply; + binaryfunc nb_inplace_remainder; + ternaryfunc nb_inplace_power; + binaryfunc nb_inplace_lshift; + binaryfunc nb_inplace_rshift; + binaryfunc nb_inplace_and; + binaryfunc nb_inplace_xor; + binaryfunc nb_inplace_or; + + binaryfunc nb_floor_divide; + binaryfunc nb_true_divide; + binaryfunc nb_inplace_floor_divide; + binaryfunc nb_inplace_true_divide; + + unaryfunc nb_index; + + binaryfunc nb_matrix_multiply; + binaryfunc nb_inplace_matrix_multiply; +} PyNumberMethods; + +typedef struct { + lenfunc sq_length; + binaryfunc sq_concat; + ssizeargfunc sq_repeat; + ssizeargfunc sq_item; + void *was_sq_slice; + ssizeobjargproc sq_ass_item; + void *was_sq_ass_slice; + objobjproc sq_contains; + + binaryfunc sq_inplace_concat; + ssizeargfunc sq_inplace_repeat; +} PySequenceMethods; + +typedef struct { + lenfunc mp_length; + binaryfunc mp_subscript; + objobjargproc mp_ass_subscript; +} PyMappingMethods; + +typedef PySendResult (*sendfunc)(PyObject *iter, PyObject *value, PyObject **result); + +typedef struct { + unaryfunc am_await; + unaryfunc am_aiter; + unaryfunc am_anext; + sendfunc am_send; +} PyAsyncMethods; + +typedef struct { + getbufferproc bf_getbuffer; + releasebufferproc bf_releasebuffer; +} PyBufferProcs; + +/* Allow printfunc in the tp_vectorcall_offset slot for + * backwards-compatibility */ +typedef Py_ssize_t printfunc; + +// If this structure is modified, Doc/includes/typestruct.h should be updated +// as well. +struct _typeobject { + PyObject_VAR_HEAD + const char *tp_name; /* For printing, in format "<module>.<name>"
*/ + Py_ssize_t tp_basicsize, tp_itemsize; /* For allocation */ + + /* Methods to implement standard operations */ + + destructor tp_dealloc; + Py_ssize_t tp_vectorcall_offset; + getattrfunc tp_getattr; + setattrfunc tp_setattr; + PyAsyncMethods *tp_as_async; /* formerly known as tp_compare (Python 2) + or tp_reserved (Python 3) */ + reprfunc tp_repr; + + /* Method suites for standard classes */ + + PyNumberMethods *tp_as_number; + PySequenceMethods *tp_as_sequence; + PyMappingMethods *tp_as_mapping; + + /* More standard operations (here for binary compatibility) */ + + hashfunc tp_hash; + ternaryfunc tp_call; + reprfunc tp_str; + getattrofunc tp_getattro; + setattrofunc tp_setattro; + + /* Functions to access object as input/output buffer */ + PyBufferProcs *tp_as_buffer; + + /* Flags to define presence of optional/expanded features */ + unsigned long tp_flags; + + const char *tp_doc; /* Documentation string */ + + /* Assigned meaning in release 2.0 */ + /* call function for all accessible objects */ + traverseproc tp_traverse; + + /* delete references to contained objects */ + inquiry tp_clear; + + /* Assigned meaning in release 2.1 */ + /* rich comparisons */ + richcmpfunc tp_richcompare; + + /* weak reference enabler */ + Py_ssize_t tp_weaklistoffset; + + /* Iterators */ + getiterfunc tp_iter; + iternextfunc tp_iternext; + + /* Attribute descriptor and subclassing stuff */ + PyMethodDef *tp_methods; + PyMemberDef *tp_members; + PyGetSetDef *tp_getset; + // Strong reference on a heap type, borrowed reference on a static type + PyTypeObject *tp_base; + PyObject *tp_dict; + descrgetfunc tp_descr_get; + descrsetfunc tp_descr_set; + Py_ssize_t tp_dictoffset; + initproc tp_init; + allocfunc tp_alloc; + newfunc tp_new; + freefunc tp_free; /* Low-level free-memory routine */ + inquiry tp_is_gc; /* For PyObject_IS_GC */ + PyObject *tp_bases; + PyObject *tp_mro; /* method resolution order */ + PyObject *tp_cache; /* no longer used */ + void *tp_subclasses; /* for static builtin types this is an index */ + PyObject *tp_weaklist; /* not used for static builtin types */ + destructor tp_del; + + /* Type attribute cache version tag. Added in version 2.6 */ + unsigned int tp_version_tag; + + destructor tp_finalize; + vectorcallfunc tp_vectorcall; + + /* bitset of which type-watchers care about this type */ + unsigned char tp_watched; + uint16_t tp_versions_used; +}; + +/* This struct is used by the specializer + * It should be treated as an opaque blob + * by code other than the specializer and interpreter. */ +struct _specialization_cache { + // In order to avoid bloating the bytecode with lots of inline caches, the + // members of this structure have a somewhat unique contract. They are set + // by the specialization machinery, and are invalidated by PyType_Modified. + // The rules for using them are as follows: + // - If getitem is non-NULL, then it is the same Python function that + // PyType_Lookup(cls, "__getitem__") would return. + // - If getitem is NULL, then getitem_version is meaningless. + // - If getitem->func_version == getitem_version, then getitem can be called + // with two positional arguments and no keyword arguments, and has neither + // *args nor **kwargs (as required by BINARY_SUBSCR_GETITEM): + PyObject *getitem; + uint32_t getitem_version; + PyObject *init; +}; + +/* The *real* layout of a type object when allocated on the heap */ +typedef struct _heaptypeobject { + /* Note: there's a dependency on the order of these members + in slotptr() in typeobject.c . 
*/ + PyTypeObject ht_type; + PyAsyncMethods as_async; + PyNumberMethods as_number; + PyMappingMethods as_mapping; + PySequenceMethods as_sequence; /* as_sequence comes after as_mapping, + so that the mapping wins when both + the mapping and the sequence define + a given operator (e.g. __getitem__). + see add_operators() in typeobject.c . */ + PyBufferProcs as_buffer; + PyObject *ht_name, *ht_slots, *ht_qualname; + struct _dictkeysobject *ht_cached_keys; + PyObject *ht_module; + char *_ht_tpname; // Storage for "tp_name"; see PyType_FromModuleAndSpec + struct _specialization_cache _spec_cache; // For use by the specializer. + /* here are optional user slots, followed by the members. */ +} PyHeapTypeObject; + +PyAPI_FUNC(const char *) _PyType_Name(PyTypeObject *); +PyAPI_FUNC(PyObject *) _PyType_Lookup(PyTypeObject *, PyObject *); +PyAPI_FUNC(PyObject *) _PyType_LookupRef(PyTypeObject *, PyObject *); +PyAPI_FUNC(PyObject *) PyType_GetDict(PyTypeObject *); + +PyAPI_FUNC(int) PyObject_Print(PyObject *, FILE *, int); +PyAPI_FUNC(void) _Py_BreakPoint(void); +PyAPI_FUNC(void) _PyObject_Dump(PyObject *); + +PyAPI_FUNC(PyObject*) _PyObject_GetAttrId(PyObject *, _Py_Identifier *); + +PyAPI_FUNC(PyObject **) _PyObject_GetDictPtr(PyObject *); +PyAPI_FUNC(void) PyObject_CallFinalizer(PyObject *); +PyAPI_FUNC(int) PyObject_CallFinalizerFromDealloc(PyObject *); + +PyAPI_FUNC(void) PyUnstable_Object_ClearWeakRefsNoCallbacks(PyObject *); + +/* Same as PyObject_Generic{Get,Set}Attr, but passing the attributes + dict as the last parameter. */ +PyAPI_FUNC(PyObject *) +_PyObject_GenericGetAttrWithDict(PyObject *, PyObject *, PyObject *, int); +PyAPI_FUNC(int) +_PyObject_GenericSetAttrWithDict(PyObject *, PyObject *, + PyObject *, PyObject *); + +PyAPI_FUNC(PyObject *) _PyObject_FunctionStr(PyObject *); + +/* Safely decref `dst` and set `dst` to `src`. + * + * As in the case of Py_CLEAR, "the obvious" code can be deadly: + * + * Py_DECREF(dst); + * dst = src; + * + * The safe way is: + * + * Py_SETREF(dst, src); + * + * That arranges to set `dst` to `src` _before_ decref'ing, so that any code + * triggered as a side-effect of `dst` getting torn down no longer believes + * `dst` points to a valid object. + * + * Temporary variables are used to only evaluate macro arguments once and so + * avoid the duplication of side effects. _Py_TYPEOF() or memcpy() is used to + * avoid a miscompilation caused by type punning. See Py_CLEAR() comment for + * implementation details about type punning. + * + * The memcpy() implementation does not emit a compiler warning if 'src' does + * not have the same type as 'dst': any pointer type is accepted for 'src'. + */ +#ifdef _Py_TYPEOF +#define Py_SETREF(dst, src) \ + do { \ + _Py_TYPEOF(dst)* _tmp_dst_ptr = &(dst); \ + _Py_TYPEOF(dst) _tmp_old_dst = (*_tmp_dst_ptr); \ + *_tmp_dst_ptr = (src); \ + Py_DECREF(_tmp_old_dst); \ + } while (0) +#else +#define Py_SETREF(dst, src) \ + do { \ + PyObject **_tmp_dst_ptr = _Py_CAST(PyObject**, &(dst)); \ + PyObject *_tmp_old_dst = (*_tmp_dst_ptr); \ + PyObject *_tmp_src = _PyObject_CAST(src); \ + memcpy(_tmp_dst_ptr, &_tmp_src, sizeof(PyObject*)); \ + Py_DECREF(_tmp_old_dst); \ + } while (0) +#endif + +/* Py_XSETREF() is a variant of Py_SETREF() that uses Py_XDECREF() instead of + * Py_DECREF().
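+ * + * Illustrative use (the "cached" field is hypothetical): the old value may + * legitimately be NULL here, which Py_DECREF() would not accept: + * + *     Py_XSETREF(self->cached, Py_NewRef(value));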
+ */ +#ifdef _Py_TYPEOF +#define Py_XSETREF(dst, src) \ + do { \ + _Py_TYPEOF(dst)* _tmp_dst_ptr = &(dst); \ + _Py_TYPEOF(dst) _tmp_old_dst = (*_tmp_dst_ptr); \ + *_tmp_dst_ptr = (src); \ + Py_XDECREF(_tmp_old_dst); \ + } while (0) +#else +#define Py_XSETREF(dst, src) \ + do { \ + PyObject **_tmp_dst_ptr = _Py_CAST(PyObject**, &(dst)); \ + PyObject *_tmp_old_dst = (*_tmp_dst_ptr); \ + PyObject *_tmp_src = _PyObject_CAST(src); \ + memcpy(_tmp_dst_ptr, &_tmp_src, sizeof(PyObject*)); \ + Py_XDECREF(_tmp_old_dst); \ + } while (0) +#endif + + +/* Define a pair of assertion macros: + _PyObject_ASSERT_FROM(), _PyObject_ASSERT_WITH_MSG() and _PyObject_ASSERT(). + + These work like the regular C assert(), in that they will abort the + process with a message on stderr if the given condition fails to hold, + but compile away to nothing if NDEBUG is defined. + + However, before aborting, Python will also try to call _PyObject_Dump() on + the given object. This may be of use when investigating bugs in which a + particular object is corrupt (e.g. a buggy tp_visit method in an extension + module breaking the garbage collector), to help locate the broken objects. + + The WITH_MSG variant allows you to supply an additional message that Python + will attempt to print to stderr, after the object dump. */ +#ifdef NDEBUG + /* No debugging: compile away the assertions: */ +# define _PyObject_ASSERT_FROM(obj, expr, msg, filename, lineno, func) \ + ((void)0) +#else + /* With debugging: generate checks: */ +# define _PyObject_ASSERT_FROM(obj, expr, msg, filename, lineno, func) \ + ((expr) \ + ? (void)(0) \ + : _PyObject_AssertFailed((obj), Py_STRINGIFY(expr), \ + (msg), (filename), (lineno), (func))) +#endif + +#define _PyObject_ASSERT_WITH_MSG(obj, expr, msg) \ + _PyObject_ASSERT_FROM((obj), expr, (msg), __FILE__, __LINE__, __func__) +#define _PyObject_ASSERT(obj, expr) \ + _PyObject_ASSERT_WITH_MSG((obj), expr, NULL) + +#define _PyObject_ASSERT_FAILED_MSG(obj, msg) \ + _PyObject_AssertFailed((obj), NULL, (msg), __FILE__, __LINE__, __func__) + +/* Declare and define _PyObject_AssertFailed() even when NDEBUG is defined, + to avoid causing compiler/linker errors when building extensions without + NDEBUG against a Python built with NDEBUG defined. + + msg, expr and function can be NULL. */ +PyAPI_FUNC(void) _Py_NO_RETURN _PyObject_AssertFailed( + PyObject *obj, + const char *expr, + const char *msg, + const char *file, + int line, + const char *function); + + +/* Trashcan mechanism, thanks to Christian Tismer. + +When deallocating a container object, it's possible to trigger an unbounded +chain of deallocations, as each Py_DECREF in turn drops the refcount on "the +next" object in the chain to 0. This can easily lead to stack overflows, +especially in threads (which typically have less stack space to work with). + +A container object can avoid this by bracketing the body of its tp_dealloc +function with a pair of macros: + +static void +mytype_dealloc(mytype *p) +{ + ... declarations go here ... + + PyObject_GC_UnTrack(p); // must untrack first + Py_TRASHCAN_BEGIN(p, mytype_dealloc) + ... The body of the deallocator goes here, including all calls ... + ... to Py_DECREF on contained objects. ... + Py_TRASHCAN_END // there should be no code after this +} + +CAUTION: Never return from the middle of the body! If the body needs to +"get out early", put a label immediately before the Py_TRASHCAN_END +call, and goto it.
Else the call-depth counter (see below) will stay +above 0 forever, and the trashcan will never get emptied. + +How it works: The BEGIN macro increments a call-depth counter. So long +as this counter is small, the body of the deallocator is run directly without +further ado. But if the counter gets large, it instead adds p to a list of +objects to be deallocated later, skips the body of the deallocator, and +resumes execution after the END macro. The tp_dealloc routine then returns +without deallocating anything (and so unbounded call-stack depth is avoided). + +When the call stack finishes unwinding again, code generated by the END macro +notices this, and calls another routine to deallocate all the objects that +may have been added to the list of deferred deallocations. In effect, a +chain of N deallocations is broken into (N-1)/(Py_TRASHCAN_HEADROOM-1) pieces, +with the call stack never exceeding a depth of Py_TRASHCAN_HEADROOM. + +Since the tp_dealloc of a subclass typically calls the tp_dealloc of the base +class, we need to ensure that the trashcan is only triggered on the tp_dealloc +of the actual class being deallocated. Otherwise we might end up with a +partially-deallocated object. To check this, the tp_dealloc function must be +passed as second argument to Py_TRASHCAN_BEGIN(). +*/ + +/* Python 3.9 private API, invoked by the macros below. */ +PyAPI_FUNC(int) _PyTrash_begin(PyThreadState *tstate, PyObject *op); +PyAPI_FUNC(void) _PyTrash_end(PyThreadState *tstate); + +PyAPI_FUNC(void) _PyTrash_thread_deposit_object(PyThreadState *tstate, PyObject *op); +PyAPI_FUNC(void) _PyTrash_thread_destroy_chain(PyThreadState *tstate); + + +/* Python 3.10 private API, invoked by the Py_TRASHCAN_BEGIN(). */ + +/* To avoid raising recursion errors during dealloc trigger trashcan before we reach + * recursion limit. To avoid trashing, we don't attempt to empty the trashcan until + * we have headroom above the trigger limit */ +#define Py_TRASHCAN_HEADROOM 50 + +#define Py_TRASHCAN_BEGIN(op, dealloc) \ +do { \ + PyThreadState *tstate = PyThreadState_Get(); \ + if (tstate->c_recursion_remaining <= Py_TRASHCAN_HEADROOM && Py_TYPE(op)->tp_dealloc == (destructor)dealloc) { \ + _PyTrash_thread_deposit_object(tstate, (PyObject *)op); \ + break; \ + } \ + tstate->c_recursion_remaining--; + /* The body of the deallocator is here. */ +#define Py_TRASHCAN_END \ + tstate->c_recursion_remaining++; \ + if (tstate->delete_later && tstate->c_recursion_remaining > (Py_TRASHCAN_HEADROOM*2)) { \ + _PyTrash_thread_destroy_chain(tstate); \ + } \ +} while (0); + + +PyAPI_FUNC(void *) PyObject_GetItemData(PyObject *obj); + +PyAPI_FUNC(int) PyObject_VisitManagedDict(PyObject *obj, visitproc visit, void *arg); +PyAPI_FUNC(int) _PyObject_SetManagedDict(PyObject *obj, PyObject *new_dict); +PyAPI_FUNC(void) PyObject_ClearManagedDict(PyObject *obj); + +#define TYPE_MAX_WATCHERS 8 + +typedef int(*PyType_WatchCallback)(PyTypeObject *); +PyAPI_FUNC(int) PyType_AddWatcher(PyType_WatchCallback callback); +PyAPI_FUNC(int) PyType_ClearWatcher(int watcher_id); +PyAPI_FUNC(int) PyType_Watch(int watcher_id, PyObject *type); +PyAPI_FUNC(int) PyType_Unwatch(int watcher_id, PyObject *type); + +/* Attempt to assign a version tag to the given type. + * + * Returns 1 if the type already had a valid version tag or a new one was + * assigned, or 0 if a new tag could not be assigned. 
+ */ +PyAPI_FUNC(int) PyUnstable_Type_AssignVersionTag(PyTypeObject *type); + + +typedef enum { + PyRefTracer_CREATE = 0, + PyRefTracer_DESTROY = 1, +} PyRefTracerEvent; + +typedef int (*PyRefTracer)(PyObject *, PyRefTracerEvent event, void *); +PyAPI_FUNC(int) PyRefTracer_SetTracer(PyRefTracer tracer, void *data); +PyAPI_FUNC(PyRefTracer) PyRefTracer_GetTracer(void**); diff --git a/Include/cpython/objimpl.h b/Include/cpython/objimpl.h new file mode 100644 index 0000000000000000000000000000000000000000..e0c2ce286f13ce38c3308c17e1cdc2308bf76edf --- /dev/null +++ b/Include/cpython/objimpl.h @@ -0,0 +1,104 @@ +#ifndef Py_CPYTHON_OBJIMPL_H +# error "this header file must not be included directly" +#endif + +static inline size_t _PyObject_SIZE(PyTypeObject *type) { + return _Py_STATIC_CAST(size_t, type->tp_basicsize); +} + +/* _PyObject_VAR_SIZE returns the number of bytes (as size_t) allocated for a + vrbl-size object with nitems items, exclusive of gc overhead (if any). The + value is rounded up to the closest multiple of sizeof(void *), in order to + ensure that pointer fields at the end of the object are correctly aligned + for the platform (this is of special importance for subclasses of, e.g., + str or int, so that pointers can be stored after the embedded data). + + Note that there's no memory wastage in doing this, as malloc has to + return (at worst) pointer-aligned memory anyway. +*/ +#if ((SIZEOF_VOID_P - 1) & SIZEOF_VOID_P) != 0 +# error "_PyObject_VAR_SIZE requires SIZEOF_VOID_P be a power of 2" +#endif + +static inline size_t _PyObject_VAR_SIZE(PyTypeObject *type, Py_ssize_t nitems) { + size_t size = _Py_STATIC_CAST(size_t, type->tp_basicsize); + size += _Py_STATIC_CAST(size_t, nitems) * _Py_STATIC_CAST(size_t, type->tp_itemsize); + return _Py_SIZE_ROUND_UP(size, SIZEOF_VOID_P); +} + + +/* This example code implements an object constructor with a custom + allocator, where PyObject_New is inlined, and shows the important + distinction between two steps (at least): + 1) the actual allocation of the object storage; + 2) the initialization of the Python specific fields + in this storage with PyObject_{Init, InitVar}. + + PyObject * + YourObject_New(...) + { + PyObject *op; + + op = (PyObject *) Your_Allocator(_PyObject_SIZE(YourTypeStruct)); + if (op == NULL) { + return PyErr_NoMemory(); + } + + PyObject_Init(op, &YourTypeStruct); + + op->ob_field = value; + ... + return op; + } + + Note that in C++, the use of the new operator usually implies that + the 1st step is performed automatically for you, so in a C++ class + constructor you would start directly with PyObject_Init/InitVar. */ + + +typedef struct { + /* user context passed as the first argument to the 2 functions */ + void *ctx; + + /* allocate an arena of size bytes */ + void* (*alloc) (void *ctx, size_t size); + + /* free an arena */ + void (*free) (void *ctx, void *ptr, size_t size); +} PyObjectArenaAllocator; + +/* Get the arena allocator. */ +PyAPI_FUNC(void) PyObject_GetArenaAllocator(PyObjectArenaAllocator *allocator); + +/* Set the arena allocator. 
*/ +PyAPI_FUNC(void) PyObject_SetArenaAllocator(PyObjectArenaAllocator *allocator); + + +/* Test if an object implements the garbage collector protocol */ +PyAPI_FUNC(int) PyObject_IS_GC(PyObject *obj); + + +// Test if a type supports weak references +PyAPI_FUNC(int) PyType_SUPPORTS_WEAKREFS(PyTypeObject *type); + +PyAPI_FUNC(PyObject **) PyObject_GET_WEAKREFS_LISTPTR(PyObject *op); + +PyAPI_FUNC(PyObject *) PyUnstable_Object_GC_NewWithExtraData(PyTypeObject *, + size_t); + + +/* Visit all live GC-capable objects, similar to gc.get_objects(None). The + * supplied callback is called on every such object with the void* arg set + * to the supplied arg. Returning 0 from the callback ends iteration, returning + * 1 allows iteration to continue. Returning any other value may result in + * undefined behaviour. + * + * If new objects are (de)allocated by the callback it is undefined if they + * will be visited. + + * Garbage collection is disabled during operation. Explicitly running a + * collection in the callback may lead to undefined behaviour e.g. visiting the + * same objects multiple times or not at all. + */ +typedef int (*gcvisitobjects_t)(PyObject*, void*); +PyAPI_FUNC(void) PyUnstable_GC_VisitObjects(gcvisitobjects_t callback, void* arg); diff --git a/Include/cpython/odictobject.h b/Include/cpython/odictobject.h new file mode 100644 index 0000000000000000000000000000000000000000..3822d554868c10e3910f13aad4272f2973726eed --- /dev/null +++ b/Include/cpython/odictobject.h @@ -0,0 +1,43 @@ +#ifndef Py_ODICTOBJECT_H +#define Py_ODICTOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + + +/* OrderedDict */ +/* This API is optional and mostly redundant. */ + +#ifndef Py_LIMITED_API + +typedef struct _odictobject PyODictObject; + +PyAPI_DATA(PyTypeObject) PyODict_Type; +PyAPI_DATA(PyTypeObject) PyODictIter_Type; +PyAPI_DATA(PyTypeObject) PyODictKeys_Type; +PyAPI_DATA(PyTypeObject) PyODictItems_Type; +PyAPI_DATA(PyTypeObject) PyODictValues_Type; + +#define PyODict_Check(op) PyObject_TypeCheck((op), &PyODict_Type) +#define PyODict_CheckExact(op) Py_IS_TYPE((op), &PyODict_Type) +#define PyODict_SIZE(op) PyDict_GET_SIZE((op)) + +PyAPI_FUNC(PyObject *) PyODict_New(void); +PyAPI_FUNC(int) PyODict_SetItem(PyObject *od, PyObject *key, PyObject *item); +PyAPI_FUNC(int) PyODict_DelItem(PyObject *od, PyObject *key); + +/* wrappers around PyDict* functions */ +#define PyODict_GetItem(od, key) PyDict_GetItem(_PyObject_CAST(od), (key)) +#define PyODict_GetItemWithError(od, key) \ + PyDict_GetItemWithError(_PyObject_CAST(od), (key)) +#define PyODict_Contains(od, key) PyDict_Contains(_PyObject_CAST(od), (key)) +#define PyODict_Size(od) PyDict_Size(_PyObject_CAST(od)) +#define PyODict_GetItemString(od, key) \ + PyDict_GetItemString(_PyObject_CAST(od), (key)) + +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_ODICTOBJECT_H */ diff --git a/Include/cpython/picklebufobject.h b/Include/cpython/picklebufobject.h new file mode 100644 index 0000000000000000000000000000000000000000..f3cbaeef919518cd3154ce6cd4634210aebf42de --- /dev/null +++ b/Include/cpython/picklebufobject.h @@ -0,0 +1,31 @@ +/* PickleBuffer object. This is built-in for ease of use from third-party + * C extensions. 
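+ * + * Usage sketch (illustrative; "obj" is assumed to export a buffer): + * + *     PyObject *pb = PyPickleBuffer_FromObject(obj); + *     if (pb == NULL) { + *         return NULL;  // exception set + *     } + *     const Py_buffer *view = PyPickleBuffer_GetBuffer(pb);  // NULL if released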
+ */
+
+#ifndef Py_PICKLEBUFOBJECT_H
+#define Py_PICKLEBUFOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_LIMITED_API
+
+PyAPI_DATA(PyTypeObject) PyPickleBuffer_Type;
+
+#define PyPickleBuffer_Check(op) Py_IS_TYPE((op), &PyPickleBuffer_Type)
+
+/* Create a PickleBuffer redirecting to the given buffer-enabled object */
+PyAPI_FUNC(PyObject *) PyPickleBuffer_FromObject(PyObject *);
+/* Get the PickleBuffer's underlying view to the original object
+ * (NULL if released)
+ */
+PyAPI_FUNC(const Py_buffer *) PyPickleBuffer_GetBuffer(PyObject *);
+/* Release the PickleBuffer. Returns 0 on success, -1 on error. */
+PyAPI_FUNC(int) PyPickleBuffer_Release(PyObject *);
+
+#endif /* !Py_LIMITED_API */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_PICKLEBUFOBJECT_H */
diff --git a/Include/cpython/pthread_stubs.h b/Include/cpython/pthread_stubs.h
new file mode 100644
index 0000000000000000000000000000000000000000..e542eaa5bff0cf0322441f18e05da6dba35ab417
--- /dev/null
+++ b/Include/cpython/pthread_stubs.h
@@ -0,0 +1,105 @@
+#ifndef Py_CPYTHON_PTRHEAD_STUBS_H
+#define Py_CPYTHON_PTRHEAD_STUBS_H
+
+#if !defined(HAVE_PTHREAD_STUBS)
+# error "this header file requires stubbed pthreads."
+#endif
+
+#ifndef _POSIX_THREADS
+# define _POSIX_THREADS 1
+#endif
+
+/* Minimal pthread stubs for CPython.
+ *
+ * The stubs implement the minimum pthread API for CPython.
+ * - pthread_create() fails.
+ * - pthread_exit() calls exit(0).
+ * - pthread_key_*() functions implement minimal TSS without destructor.
+ * - all other functions do nothing and return 0.
+ */
+
+#ifdef __wasi__
+// WASI's <bits/alltypes.h> provides type definitions when __NEED_ is set.
+// The header file can be included multiple times.
+//
+// <pthread.h> may also define these macros.
+# ifndef __NEED_pthread_cond_t
+#  define __NEED_pthread_cond_t 1
+# endif
+# ifndef __NEED_pthread_condattr_t
+#  define __NEED_pthread_condattr_t 1
+# endif
+# ifndef __NEED_pthread_mutex_t
+#  define __NEED_pthread_mutex_t 1
+# endif
+# ifndef __NEED_pthread_mutexattr_t
+#  define __NEED_pthread_mutexattr_t 1
+# endif
+# ifndef __NEED_pthread_key_t
+#  define __NEED_pthread_key_t 1
+# endif
+# ifndef __NEED_pthread_t
+#  define __NEED_pthread_t 1
+# endif
+# ifndef __NEED_pthread_attr_t
+#  define __NEED_pthread_attr_t 1
+# endif
+# include <bits/alltypes.h>
+#else
+typedef struct { void *__x; } pthread_cond_t;
+typedef struct { unsigned __attr; } pthread_condattr_t;
+typedef struct { void *__x; } pthread_mutex_t;
+typedef struct { unsigned __attr; } pthread_mutexattr_t;
+typedef unsigned pthread_key_t;
+typedef unsigned pthread_t;
+typedef struct { unsigned __attr; } pthread_attr_t;
+#endif
+
+// mutex
+PyAPI_FUNC(int) pthread_mutex_init(pthread_mutex_t *restrict mutex,
+                                   const pthread_mutexattr_t *restrict attr);
+PyAPI_FUNC(int) pthread_mutex_destroy(pthread_mutex_t *mutex);
+PyAPI_FUNC(int) pthread_mutex_trylock(pthread_mutex_t *mutex);
+PyAPI_FUNC(int) pthread_mutex_lock(pthread_mutex_t *mutex);
+PyAPI_FUNC(int) pthread_mutex_unlock(pthread_mutex_t *mutex);
+
+// condition
+PyAPI_FUNC(int) pthread_cond_init(pthread_cond_t *restrict cond,
+                                  const pthread_condattr_t *restrict attr);
+PyAPI_FUNC(int) pthread_cond_destroy(pthread_cond_t *cond);
+PyAPI_FUNC(int) pthread_cond_wait(pthread_cond_t *restrict cond,
+                                  pthread_mutex_t *restrict mutex);
+PyAPI_FUNC(int) pthread_cond_timedwait(pthread_cond_t *restrict cond,
+                                       pthread_mutex_t *restrict mutex,
+                                       const struct timespec *restrict abstime);
+PyAPI_FUNC(int) pthread_cond_signal(pthread_cond_t *cond);
+PyAPI_FUNC(int) pthread_condattr_init(pthread_condattr_t *attr);
+PyAPI_FUNC(int) pthread_condattr_setclock(
+    pthread_condattr_t *attr, clockid_t clock_id);
+
+// pthread
+PyAPI_FUNC(int) pthread_create(pthread_t *restrict thread,
+                               const pthread_attr_t *restrict attr,
+                               void *(*start_routine)(void *),
+                               void *restrict arg);
+PyAPI_FUNC(int) pthread_detach(pthread_t thread);
+PyAPI_FUNC(int) pthread_join(pthread_t thread, void** value_ptr);
+PyAPI_FUNC(pthread_t) pthread_self(void);
+PyAPI_FUNC(int) pthread_exit(void *retval) __attribute__ ((__noreturn__));
+PyAPI_FUNC(int) pthread_attr_init(pthread_attr_t *attr);
+PyAPI_FUNC(int) pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize);
+PyAPI_FUNC(int) pthread_attr_destroy(pthread_attr_t *attr);
+
+
+// pthread_key
+#ifndef PTHREAD_KEYS_MAX
+# define PTHREAD_KEYS_MAX 128
+#endif
+
+PyAPI_FUNC(int) pthread_key_create(pthread_key_t *key,
+                                   void (*destr_function)(void *));
+PyAPI_FUNC(int) pthread_key_delete(pthread_key_t key);
+PyAPI_FUNC(void *) pthread_getspecific(pthread_key_t key);
+PyAPI_FUNC(int) pthread_setspecific(pthread_key_t key, const void *value);
+
+#endif // Py_CPYTHON_PTRHEAD_STUBS_H
diff --git a/Include/cpython/pyatomic.h b/Include/cpython/pyatomic.h
new file mode 100644
index 0000000000000000000000000000000000000000..28029859d3df2b826c27812fc845ba3016b884c0
--- /dev/null
+++ b/Include/cpython/pyatomic.h
@@ -0,0 +1,569 @@
+// This header provides cross-platform low-level atomic operations
+// similar to C11 atomics.
+//
+// Operations are sequentially consistent unless they have a suffix indicating
+// otherwise. If in doubt, prefer the sequentially consistent operations.
+//
+// The "_relaxed" suffix for load and store operations indicates the "relaxed"
+// memory order. They don't provide synchronization, but (roughly speaking)
+// guarantee somewhat sane behavior for races instead of undefined behavior.
+// In practice, they correspond to "normal" hardware load and store
+// instructions, so they are almost as inexpensive as plain loads and stores
+// in C.
+//
+// Note that atomic read-modify-write operations like _Py_atomic_add_* return
+// the previous value of the atomic variable, not the new value.
+//
+// See https://en.cppreference.com/w/c/atomic for more information on C11
+// atomics.
+// See https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
+// "A Relaxed Guide to memory_order_relaxed" for discussion of and common
+// usage of relaxed atomics.
+//
+// Functions with pseudo Python code:
+//
+// def _Py_atomic_load(obj):
+//     return obj  # sequential consistency
+//
+// def _Py_atomic_load_relaxed(obj):
+//     return obj  # relaxed consistency
+//
+// def _Py_atomic_store(obj, value):
+//     obj = value  # sequential consistency
+//
+// def _Py_atomic_store_relaxed(obj, value):
+//     obj = value  # relaxed consistency
+//
+// def _Py_atomic_exchange(obj, value):
+//     # sequential consistency
+//     old_obj = obj
+//     obj = value
+//     return old_obj
+//
+// def _Py_atomic_compare_exchange(obj, expected, desired):
+//     # sequential consistency
+//     if obj == expected:
+//         obj = desired
+//         return True
+//     else:
+//         expected = obj
+//         return False
+//
+// def _Py_atomic_add(obj, value):
+//     # sequential consistency
+//     old_obj = obj
+//     obj += value
+//     return old_obj
+//
+// def _Py_atomic_and(obj, value):
+//     # sequential consistency
+//     old_obj = obj
+//     obj &= value
+//     return old_obj
+//
+// def _Py_atomic_or(obj, value):
+//     # sequential consistency
+//     old_obj = obj
+//     obj |= value
+//     return old_obj
+//
+// Other functions:
+//
+// def _Py_atomic_load_ptr_acquire(obj):
+//     return obj  # acquire
+//
+// def _Py_atomic_store_ptr_release(obj, value):
+//     obj = value  # release
+//
+// def _Py_atomic_fence_seq_cst():
+//     # sequential consistency
+//     ...
+//
+// def _Py_atomic_fence_release():
+//     # release
+//     ...
+
+#ifndef Py_CPYTHON_ATOMIC_H
+# error "this header file must not be included directly"
+#endif
+
+// --- _Py_atomic_add --------------------------------------------------------
+// Atomically adds `value` to `obj` and returns the previous value
+
+static inline int
+_Py_atomic_add_int(int *obj, int value);
+
+static inline int8_t
+_Py_atomic_add_int8(int8_t *obj, int8_t value);
+
+static inline int16_t
+_Py_atomic_add_int16(int16_t *obj, int16_t value);
+
+static inline int32_t
+_Py_atomic_add_int32(int32_t *obj, int32_t value);
+
+static inline int64_t
+_Py_atomic_add_int64(int64_t *obj, int64_t value);
+
+static inline intptr_t
+_Py_atomic_add_intptr(intptr_t *obj, intptr_t value);
+
+static inline unsigned int
+_Py_atomic_add_uint(unsigned int *obj, unsigned int value);
+
+static inline uint8_t
+_Py_atomic_add_uint8(uint8_t *obj, uint8_t value);
+
+static inline uint16_t
+_Py_atomic_add_uint16(uint16_t *obj, uint16_t value);
+
+static inline uint32_t
+_Py_atomic_add_uint32(uint32_t *obj, uint32_t value);
+
+static inline uint64_t
+_Py_atomic_add_uint64(uint64_t *obj, uint64_t value);
+
+static inline uintptr_t
+_Py_atomic_add_uintptr(uintptr_t *obj, uintptr_t value);
+
+static inline Py_ssize_t
+_Py_atomic_add_ssize(Py_ssize_t *obj, Py_ssize_t value);
+
+
+// --- _Py_atomic_compare_exchange -------------------------------------------
+// Performs an atomic compare-and-exchange.
+//
+// - If `*obj` and `*expected` are equal, store `desired` into `*obj`
+//   and return 1 (success).
+// - Otherwise, store the `*obj` current value into `*expected`
+//   and return 0 (failure).
+//
+// These correspond to the C11 atomic_compare_exchange_strong() function.
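/*
 * [Editorial sketch, not part of the upstream header] A typical consumer of
 * the compare-exchange API declared just below is a retry loop that
 * recomputes the desired value from the observed one until the swap
 * succeeds. The helper name is hypothetical and the code is kept under
 * `#if 0` purely as illustration.
 */
#if 0
static inline int
example_atomic_max_int(int *obj, int value)
{
    int old = _Py_atomic_load_int_relaxed(obj);
    // On failure, the compare-exchange writes the current value of *obj back
    // into `old`, so each iteration re-tests against fresh data.
    while (old < value
           && !_Py_atomic_compare_exchange_int(obj, &old, value)) {
    }
    return old;
}
#endif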
+ +static inline int +_Py_atomic_compare_exchange_int(int *obj, int *expected, int desired); + +static inline int +_Py_atomic_compare_exchange_int8(int8_t *obj, int8_t *expected, int8_t desired); + +static inline int +_Py_atomic_compare_exchange_int16(int16_t *obj, int16_t *expected, int16_t desired); + +static inline int +_Py_atomic_compare_exchange_int32(int32_t *obj, int32_t *expected, int32_t desired); + +static inline int +_Py_atomic_compare_exchange_int64(int64_t *obj, int64_t *expected, int64_t desired); + +static inline int +_Py_atomic_compare_exchange_intptr(intptr_t *obj, intptr_t *expected, intptr_t desired); + +static inline int +_Py_atomic_compare_exchange_uint(unsigned int *obj, unsigned int *expected, unsigned int desired); + +static inline int +_Py_atomic_compare_exchange_uint8(uint8_t *obj, uint8_t *expected, uint8_t desired); + +static inline int +_Py_atomic_compare_exchange_uint16(uint16_t *obj, uint16_t *expected, uint16_t desired); + +static inline int +_Py_atomic_compare_exchange_uint32(uint32_t *obj, uint32_t *expected, uint32_t desired); + +static inline int +_Py_atomic_compare_exchange_uint64(uint64_t *obj, uint64_t *expected, uint64_t desired); + +static inline int +_Py_atomic_compare_exchange_uintptr(uintptr_t *obj, uintptr_t *expected, uintptr_t desired); + +static inline int +_Py_atomic_compare_exchange_ssize(Py_ssize_t *obj, Py_ssize_t *expected, Py_ssize_t desired); + +// NOTE: `obj` and `expected` are logically `void**` types, but we use `void*` +// so that we can pass types like `PyObject**` without a cast. +static inline int +_Py_atomic_compare_exchange_ptr(void *obj, void *expected, void *value); + + +// --- _Py_atomic_exchange --------------------------------------------------- +// Atomically replaces `*obj` with `value` and returns the previous value of `*obj`. + +static inline int +_Py_atomic_exchange_int(int *obj, int value); + +static inline int8_t +_Py_atomic_exchange_int8(int8_t *obj, int8_t value); + +static inline int16_t +_Py_atomic_exchange_int16(int16_t *obj, int16_t value); + +static inline int32_t +_Py_atomic_exchange_int32(int32_t *obj, int32_t value); + +static inline int64_t +_Py_atomic_exchange_int64(int64_t *obj, int64_t value); + +static inline intptr_t +_Py_atomic_exchange_intptr(intptr_t *obj, intptr_t value); + +static inline unsigned int +_Py_atomic_exchange_uint(unsigned int *obj, unsigned int value); + +static inline uint8_t +_Py_atomic_exchange_uint8(uint8_t *obj, uint8_t value); + +static inline uint16_t +_Py_atomic_exchange_uint16(uint16_t *obj, uint16_t value); + +static inline uint32_t +_Py_atomic_exchange_uint32(uint32_t *obj, uint32_t value); + +static inline uint64_t +_Py_atomic_exchange_uint64(uint64_t *obj, uint64_t value); + +static inline uintptr_t +_Py_atomic_exchange_uintptr(uintptr_t *obj, uintptr_t value); + +static inline Py_ssize_t +_Py_atomic_exchange_ssize(Py_ssize_t *obj, Py_ssize_t value); + +static inline void * +_Py_atomic_exchange_ptr(void *obj, void *value); + + +// --- _Py_atomic_and -------------------------------------------------------- +// Performs `*obj &= value` atomically and returns the previous value of `*obj`. 
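/*
 * [Editorial sketch, not part of the upstream header] The fetch-and /
 * fetch-or operations declared below are commonly used on flag words:
 * because the previous value is returned, the caller can tell whether a bit
 * actually changed. Hypothetical helper, illustration only.
 */
#if 0
static inline int
example_test_and_clear_bit(uint8_t *flags, uint8_t bit)
{
    // _Py_atomic_and_uint8() returns the value *before* the AND, so this
    // call actually cleared the bit only if it was set in the old value.
    uint8_t prev = _Py_atomic_and_uint8(flags, (uint8_t)~bit);
    return (prev & bit) != 0;
}
#endif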
+ +static inline uint8_t +_Py_atomic_and_uint8(uint8_t *obj, uint8_t value); + +static inline uint16_t +_Py_atomic_and_uint16(uint16_t *obj, uint16_t value); + +static inline uint32_t +_Py_atomic_and_uint32(uint32_t *obj, uint32_t value); + +static inline uint64_t +_Py_atomic_and_uint64(uint64_t *obj, uint64_t value); + +static inline uintptr_t +_Py_atomic_and_uintptr(uintptr_t *obj, uintptr_t value); + + +// --- _Py_atomic_or --------------------------------------------------------- +// Performs `*obj |= value` atomically and returns the previous value of `*obj`. + +static inline uint8_t +_Py_atomic_or_uint8(uint8_t *obj, uint8_t value); + +static inline uint16_t +_Py_atomic_or_uint16(uint16_t *obj, uint16_t value); + +static inline uint32_t +_Py_atomic_or_uint32(uint32_t *obj, uint32_t value); + +static inline uint64_t +_Py_atomic_or_uint64(uint64_t *obj, uint64_t value); + +static inline uintptr_t +_Py_atomic_or_uintptr(uintptr_t *obj, uintptr_t value); + + +// --- _Py_atomic_load ------------------------------------------------------- +// Atomically loads `*obj` (sequential consistency) + +static inline int +_Py_atomic_load_int(const int *obj); + +static inline int8_t +_Py_atomic_load_int8(const int8_t *obj); + +static inline int16_t +_Py_atomic_load_int16(const int16_t *obj); + +static inline int32_t +_Py_atomic_load_int32(const int32_t *obj); + +static inline int64_t +_Py_atomic_load_int64(const int64_t *obj); + +static inline intptr_t +_Py_atomic_load_intptr(const intptr_t *obj); + +static inline uint8_t +_Py_atomic_load_uint8(const uint8_t *obj); + +static inline uint16_t +_Py_atomic_load_uint16(const uint16_t *obj); + +static inline uint32_t +_Py_atomic_load_uint32(const uint32_t *obj); + +static inline uint64_t +_Py_atomic_load_uint64(const uint64_t *obj); + +static inline uintptr_t +_Py_atomic_load_uintptr(const uintptr_t *obj); + +static inline unsigned int +_Py_atomic_load_uint(const unsigned int *obj); + +static inline Py_ssize_t +_Py_atomic_load_ssize(const Py_ssize_t *obj); + +static inline void * +_Py_atomic_load_ptr(const void *obj); + + +// --- _Py_atomic_load_relaxed ----------------------------------------------- +// Loads `*obj` (relaxed consistency, i.e., no ordering) + +static inline int +_Py_atomic_load_int_relaxed(const int *obj); + +static inline int8_t +_Py_atomic_load_int8_relaxed(const int8_t *obj); + +static inline int16_t +_Py_atomic_load_int16_relaxed(const int16_t *obj); + +static inline int32_t +_Py_atomic_load_int32_relaxed(const int32_t *obj); + +static inline int64_t +_Py_atomic_load_int64_relaxed(const int64_t *obj); + +static inline intptr_t +_Py_atomic_load_intptr_relaxed(const intptr_t *obj); + +static inline uint8_t +_Py_atomic_load_uint8_relaxed(const uint8_t *obj); + +static inline uint16_t +_Py_atomic_load_uint16_relaxed(const uint16_t *obj); + +static inline uint32_t +_Py_atomic_load_uint32_relaxed(const uint32_t *obj); + +static inline uint64_t +_Py_atomic_load_uint64_relaxed(const uint64_t *obj); + +static inline uintptr_t +_Py_atomic_load_uintptr_relaxed(const uintptr_t *obj); + +static inline unsigned int +_Py_atomic_load_uint_relaxed(const unsigned int *obj); + +static inline Py_ssize_t +_Py_atomic_load_ssize_relaxed(const Py_ssize_t *obj); + +static inline void * +_Py_atomic_load_ptr_relaxed(const void *obj); + +static inline unsigned long long +_Py_atomic_load_ullong_relaxed(const unsigned long long *obj); + +// --- _Py_atomic_store ------------------------------------------------------ +// Atomically performs `*obj = value` (sequential 
consistency) + +static inline void +_Py_atomic_store_int(int *obj, int value); + +static inline void +_Py_atomic_store_int8(int8_t *obj, int8_t value); + +static inline void +_Py_atomic_store_int16(int16_t *obj, int16_t value); + +static inline void +_Py_atomic_store_int32(int32_t *obj, int32_t value); + +static inline void +_Py_atomic_store_int64(int64_t *obj, int64_t value); + +static inline void +_Py_atomic_store_intptr(intptr_t *obj, intptr_t value); + +static inline void +_Py_atomic_store_uint8(uint8_t *obj, uint8_t value); + +static inline void +_Py_atomic_store_uint16(uint16_t *obj, uint16_t value); + +static inline void +_Py_atomic_store_uint32(uint32_t *obj, uint32_t value); + +static inline void +_Py_atomic_store_uint64(uint64_t *obj, uint64_t value); + +static inline void +_Py_atomic_store_uintptr(uintptr_t *obj, uintptr_t value); + +static inline void +_Py_atomic_store_uint(unsigned int *obj, unsigned int value); + +static inline void +_Py_atomic_store_ptr(void *obj, void *value); + +static inline void +_Py_atomic_store_ssize(Py_ssize_t* obj, Py_ssize_t value); + + +// --- _Py_atomic_store_relaxed ---------------------------------------------- +// Stores `*obj = value` (relaxed consistency, i.e., no ordering) + +static inline void +_Py_atomic_store_int_relaxed(int *obj, int value); + +static inline void +_Py_atomic_store_int8_relaxed(int8_t *obj, int8_t value); + +static inline void +_Py_atomic_store_int16_relaxed(int16_t *obj, int16_t value); + +static inline void +_Py_atomic_store_int32_relaxed(int32_t *obj, int32_t value); + +static inline void +_Py_atomic_store_int64_relaxed(int64_t *obj, int64_t value); + +static inline void +_Py_atomic_store_intptr_relaxed(intptr_t *obj, intptr_t value); + +static inline void +_Py_atomic_store_uint8_relaxed(uint8_t* obj, uint8_t value); + +static inline void +_Py_atomic_store_uint16_relaxed(uint16_t *obj, uint16_t value); + +static inline void +_Py_atomic_store_uint32_relaxed(uint32_t *obj, uint32_t value); + +static inline void +_Py_atomic_store_uint64_relaxed(uint64_t *obj, uint64_t value); + +static inline void +_Py_atomic_store_uintptr_relaxed(uintptr_t *obj, uintptr_t value); + +static inline void +_Py_atomic_store_uint_relaxed(unsigned int *obj, unsigned int value); + +static inline void +_Py_atomic_store_ptr_relaxed(void *obj, void *value); + +static inline void +_Py_atomic_store_ssize_relaxed(Py_ssize_t *obj, Py_ssize_t value); + +static inline void +_Py_atomic_store_ullong_relaxed(unsigned long long *obj, + unsigned long long value); + + +// --- _Py_atomic_load_ptr_acquire / _Py_atomic_store_ptr_release ------------ + +// Loads `*obj` (acquire operation) +static inline void * +_Py_atomic_load_ptr_acquire(const void *obj); + +static inline uintptr_t +_Py_atomic_load_uintptr_acquire(const uintptr_t *obj); + +// Stores `*obj = value` (release operation) +static inline void +_Py_atomic_store_ptr_release(void *obj, void *value); + +static inline void +_Py_atomic_store_uintptr_release(uintptr_t *obj, uintptr_t value); + +static inline void +_Py_atomic_store_ssize_release(Py_ssize_t *obj, Py_ssize_t value); + +static inline void +_Py_atomic_store_int_release(int *obj, int value); + +static inline int +_Py_atomic_load_int_acquire(const int *obj); + +static inline void +_Py_atomic_store_uint32_release(uint32_t *obj, uint32_t value); + +static inline void +_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value); + +static inline uint64_t +_Py_atomic_load_uint64_acquire(const uint64_t *obj); + +static inline uint32_t 
+_Py_atomic_load_uint32_acquire(const uint32_t *obj);
+
+static inline Py_ssize_t
+_Py_atomic_load_ssize_acquire(const Py_ssize_t *obj);
+
+
+// --- _Py_atomic_fence ------------------------------------------------------
+
+// Sequential consistency fence. C11 fences have complex semantics. When
+// possible, use the atomic operations on variables defined above, which
+// generally do not require explicit use of a fence.
+// See https://en.cppreference.com/w/cpp/atomic/atomic_thread_fence
+static inline void _Py_atomic_fence_seq_cst(void);
+
+// Acquire fence
+static inline void _Py_atomic_fence_acquire(void);
+
+// Release fence
+static inline void _Py_atomic_fence_release(void);
+
+
+#ifndef _Py_USE_GCC_BUILTIN_ATOMICS
+#  if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+#    define _Py_USE_GCC_BUILTIN_ATOMICS 1
+#  elif defined(__clang__)
+#    if __has_builtin(__atomic_load)
+#      define _Py_USE_GCC_BUILTIN_ATOMICS 1
+#    endif
+#  endif
+#endif
+
+#if _Py_USE_GCC_BUILTIN_ATOMICS
+#  define Py_ATOMIC_GCC_H
+#  include "pyatomic_gcc.h"
+#  undef Py_ATOMIC_GCC_H
+#elif __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_ATOMICS__)
+#  define Py_ATOMIC_STD_H
+#  include "pyatomic_std.h"
+#  undef Py_ATOMIC_STD_H
+#elif defined(_MSC_VER)
+#  define Py_ATOMIC_MSC_H
+#  include "pyatomic_msc.h"
+#  undef Py_ATOMIC_MSC_H
+#else
+#  error "no available pyatomic implementation for this platform/compiler"
+#endif
+
+
+// --- aliases ---------------------------------------------------------------
+
+#if SIZEOF_LONG == 8
+# define _Py_atomic_load_ulong(p) \
+    _Py_atomic_load_uint64((uint64_t *)p)
+# define _Py_atomic_load_ulong_relaxed(p) \
+    _Py_atomic_load_uint64_relaxed((uint64_t *)p)
+# define _Py_atomic_store_ulong(p, v) \
+    _Py_atomic_store_uint64((uint64_t *)p, v)
+# define _Py_atomic_store_ulong_relaxed(p, v) \
+    _Py_atomic_store_uint64_relaxed((uint64_t *)p, v)
+#elif SIZEOF_LONG == 4
+# define _Py_atomic_load_ulong(p) \
+    _Py_atomic_load_uint32((uint32_t *)p)
+# define _Py_atomic_load_ulong_relaxed(p) \
+    _Py_atomic_load_uint32_relaxed((uint32_t *)p)
+# define _Py_atomic_store_ulong(p, v) \
+    _Py_atomic_store_uint32((uint32_t *)p, v)
+# define _Py_atomic_store_ulong_relaxed(p, v) \
+    _Py_atomic_store_uint32_relaxed((uint32_t *)p, v)
+#else
+# error "long must be 4 or 8 bytes in size"
+#endif  // SIZEOF_LONG
diff --git a/Include/cpython/pyatomic_gcc.h b/Include/cpython/pyatomic_gcc.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef09954d53ac1d61819856ed5c59a1fca1f1da28
--- /dev/null
+++ b/Include/cpython/pyatomic_gcc.h
@@ -0,0 +1,551 @@
+// This is the implementation of Python atomic operations using GCC's built-in
+// functions that match the C11 memory model. This implementation is preferred
+// for GCC compatible compilers, such as Clang. These functions are available
+// in GCC 4.8+ without needing to compile with --std=c11 or --std=gnu11.
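/*
 * [Editorial sketch, not part of the upstream header] The wrappers in this
 * file are thin veneers over the __atomic_* builtins; outside CPython the
 * builtins can be exercised directly. Illustration only (assumes GCC 4.8+
 * or Clang).
 */
#if 0
static int
example_fetch_increment(int *counter)
{
    // Like _Py_atomic_add_int(), __atomic_fetch_add() returns the value the
    // counter held *before* the increment.
    return __atomic_fetch_add(counter, 1, __ATOMIC_SEQ_CST);
}
#endif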
+ +#ifndef Py_ATOMIC_GCC_H +# error "this header file must not be included directly" +#endif + + +// --- _Py_atomic_add -------------------------------------------------------- + +static inline int +_Py_atomic_add_int(int *obj, int value) +{ return __atomic_fetch_add(obj, value, __ATOMIC_SEQ_CST); } + +static inline int8_t +_Py_atomic_add_int8(int8_t *obj, int8_t value) +{ return __atomic_fetch_add(obj, value, __ATOMIC_SEQ_CST); } + +static inline int16_t +_Py_atomic_add_int16(int16_t *obj, int16_t value) +{ return __atomic_fetch_add(obj, value, __ATOMIC_SEQ_CST); } + +static inline int32_t +_Py_atomic_add_int32(int32_t *obj, int32_t value) +{ return __atomic_fetch_add(obj, value, __ATOMIC_SEQ_CST); } + +static inline int64_t +_Py_atomic_add_int64(int64_t *obj, int64_t value) +{ return __atomic_fetch_add(obj, value, __ATOMIC_SEQ_CST); } + +static inline intptr_t +_Py_atomic_add_intptr(intptr_t *obj, intptr_t value) +{ return __atomic_fetch_add(obj, value, __ATOMIC_SEQ_CST); } + +static inline unsigned int +_Py_atomic_add_uint(unsigned int *obj, unsigned int value) +{ return __atomic_fetch_add(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint8_t +_Py_atomic_add_uint8(uint8_t *obj, uint8_t value) +{ return __atomic_fetch_add(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint16_t +_Py_atomic_add_uint16(uint16_t *obj, uint16_t value) +{ return __atomic_fetch_add(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint32_t +_Py_atomic_add_uint32(uint32_t *obj, uint32_t value) +{ return __atomic_fetch_add(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint64_t +_Py_atomic_add_uint64(uint64_t *obj, uint64_t value) +{ return __atomic_fetch_add(obj, value, __ATOMIC_SEQ_CST); } + +static inline uintptr_t +_Py_atomic_add_uintptr(uintptr_t *obj, uintptr_t value) +{ return __atomic_fetch_add(obj, value, __ATOMIC_SEQ_CST); } + +static inline Py_ssize_t +_Py_atomic_add_ssize(Py_ssize_t *obj, Py_ssize_t value) +{ return __atomic_fetch_add(obj, value, __ATOMIC_SEQ_CST); } + + +// --- _Py_atomic_compare_exchange ------------------------------------------- + +static inline int +_Py_atomic_compare_exchange_int(int *obj, int *expected, int desired) +{ return __atomic_compare_exchange_n(obj, expected, desired, 0, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } + +static inline int +_Py_atomic_compare_exchange_int8(int8_t *obj, int8_t *expected, int8_t desired) +{ return __atomic_compare_exchange_n(obj, expected, desired, 0, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } + +static inline int +_Py_atomic_compare_exchange_int16(int16_t *obj, int16_t *expected, int16_t desired) +{ return __atomic_compare_exchange_n(obj, expected, desired, 0, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } + +static inline int +_Py_atomic_compare_exchange_int32(int32_t *obj, int32_t *expected, int32_t desired) +{ return __atomic_compare_exchange_n(obj, expected, desired, 0, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } + +static inline int +_Py_atomic_compare_exchange_int64(int64_t *obj, int64_t *expected, int64_t desired) +{ return __atomic_compare_exchange_n(obj, expected, desired, 0, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } + +static inline int +_Py_atomic_compare_exchange_intptr(intptr_t *obj, intptr_t *expected, intptr_t desired) +{ return __atomic_compare_exchange_n(obj, expected, desired, 0, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } + +static inline int +_Py_atomic_compare_exchange_uint(unsigned int *obj, unsigned int *expected, unsigned int desired) +{ return __atomic_compare_exchange_n(obj, expected, desired, 0, + __ATOMIC_SEQ_CST, 
__ATOMIC_SEQ_CST); } + +static inline int +_Py_atomic_compare_exchange_uint8(uint8_t *obj, uint8_t *expected, uint8_t desired) +{ return __atomic_compare_exchange_n(obj, expected, desired, 0, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } + +static inline int +_Py_atomic_compare_exchange_uint16(uint16_t *obj, uint16_t *expected, uint16_t desired) +{ return __atomic_compare_exchange_n(obj, expected, desired, 0, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } + +static inline int +_Py_atomic_compare_exchange_uint32(uint32_t *obj, uint32_t *expected, uint32_t desired) +{ return __atomic_compare_exchange_n(obj, expected, desired, 0, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } + +static inline int +_Py_atomic_compare_exchange_uint64(uint64_t *obj, uint64_t *expected, uint64_t desired) +{ return __atomic_compare_exchange_n(obj, expected, desired, 0, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } + +static inline int +_Py_atomic_compare_exchange_uintptr(uintptr_t *obj, uintptr_t *expected, uintptr_t desired) +{ return __atomic_compare_exchange_n(obj, expected, desired, 0, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } + +static inline int +_Py_atomic_compare_exchange_ssize(Py_ssize_t *obj, Py_ssize_t *expected, Py_ssize_t desired) +{ return __atomic_compare_exchange_n(obj, expected, desired, 0, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } + +static inline int +_Py_atomic_compare_exchange_ptr(void *obj, void *expected, void *desired) +{ return __atomic_compare_exchange_n((void **)obj, (void **)expected, desired, 0, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } + + +// --- _Py_atomic_exchange --------------------------------------------------- + +static inline int +_Py_atomic_exchange_int(int *obj, int value) +{ return __atomic_exchange_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline int8_t +_Py_atomic_exchange_int8(int8_t *obj, int8_t value) +{ return __atomic_exchange_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline int16_t +_Py_atomic_exchange_int16(int16_t *obj, int16_t value) +{ return __atomic_exchange_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline int32_t +_Py_atomic_exchange_int32(int32_t *obj, int32_t value) +{ return __atomic_exchange_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline int64_t +_Py_atomic_exchange_int64(int64_t *obj, int64_t value) +{ return __atomic_exchange_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline intptr_t +_Py_atomic_exchange_intptr(intptr_t *obj, intptr_t value) +{ return __atomic_exchange_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline unsigned int +_Py_atomic_exchange_uint(unsigned int *obj, unsigned int value) +{ return __atomic_exchange_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint8_t +_Py_atomic_exchange_uint8(uint8_t *obj, uint8_t value) +{ return __atomic_exchange_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint16_t +_Py_atomic_exchange_uint16(uint16_t *obj, uint16_t value) +{ return __atomic_exchange_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint32_t +_Py_atomic_exchange_uint32(uint32_t *obj, uint32_t value) +{ return __atomic_exchange_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint64_t +_Py_atomic_exchange_uint64(uint64_t *obj, uint64_t value) +{ return __atomic_exchange_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline uintptr_t +_Py_atomic_exchange_uintptr(uintptr_t *obj, uintptr_t value) +{ return __atomic_exchange_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline Py_ssize_t +_Py_atomic_exchange_ssize(Py_ssize_t *obj, Py_ssize_t value) +{ return __atomic_exchange_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline void * 
+_Py_atomic_exchange_ptr(void *obj, void *value) +{ return __atomic_exchange_n((void **)obj, value, __ATOMIC_SEQ_CST); } + + +// --- _Py_atomic_and -------------------------------------------------------- + +static inline uint8_t +_Py_atomic_and_uint8(uint8_t *obj, uint8_t value) +{ return __atomic_fetch_and(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint16_t +_Py_atomic_and_uint16(uint16_t *obj, uint16_t value) +{ return __atomic_fetch_and(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint32_t +_Py_atomic_and_uint32(uint32_t *obj, uint32_t value) +{ return __atomic_fetch_and(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint64_t +_Py_atomic_and_uint64(uint64_t *obj, uint64_t value) +{ return __atomic_fetch_and(obj, value, __ATOMIC_SEQ_CST); } + +static inline uintptr_t +_Py_atomic_and_uintptr(uintptr_t *obj, uintptr_t value) +{ return __atomic_fetch_and(obj, value, __ATOMIC_SEQ_CST); } + + +// --- _Py_atomic_or --------------------------------------------------------- + +static inline uint8_t +_Py_atomic_or_uint8(uint8_t *obj, uint8_t value) +{ return __atomic_fetch_or(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint16_t +_Py_atomic_or_uint16(uint16_t *obj, uint16_t value) +{ return __atomic_fetch_or(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint32_t +_Py_atomic_or_uint32(uint32_t *obj, uint32_t value) +{ return __atomic_fetch_or(obj, value, __ATOMIC_SEQ_CST); } + +static inline uint64_t +_Py_atomic_or_uint64(uint64_t *obj, uint64_t value) +{ return __atomic_fetch_or(obj, value, __ATOMIC_SEQ_CST); } + +static inline uintptr_t +_Py_atomic_or_uintptr(uintptr_t *obj, uintptr_t value) +{ return __atomic_fetch_or(obj, value, __ATOMIC_SEQ_CST); } + + +// --- _Py_atomic_load ------------------------------------------------------- + +static inline int +_Py_atomic_load_int(const int *obj) +{ return __atomic_load_n(obj, __ATOMIC_SEQ_CST); } + +static inline int8_t +_Py_atomic_load_int8(const int8_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_SEQ_CST); } + +static inline int16_t +_Py_atomic_load_int16(const int16_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_SEQ_CST); } + +static inline int32_t +_Py_atomic_load_int32(const int32_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_SEQ_CST); } + +static inline int64_t +_Py_atomic_load_int64(const int64_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_SEQ_CST); } + +static inline intptr_t +_Py_atomic_load_intptr(const intptr_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_SEQ_CST); } + +static inline uint8_t +_Py_atomic_load_uint8(const uint8_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_SEQ_CST); } + +static inline uint16_t +_Py_atomic_load_uint16(const uint16_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_SEQ_CST); } + +static inline uint32_t +_Py_atomic_load_uint32(const uint32_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_SEQ_CST); } + +static inline uint64_t +_Py_atomic_load_uint64(const uint64_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_SEQ_CST); } + +static inline uintptr_t +_Py_atomic_load_uintptr(const uintptr_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_SEQ_CST); } + +static inline unsigned int +_Py_atomic_load_uint(const unsigned int *obj) +{ return __atomic_load_n(obj, __ATOMIC_SEQ_CST); } + +static inline Py_ssize_t +_Py_atomic_load_ssize(const Py_ssize_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_SEQ_CST); } + +static inline void * +_Py_atomic_load_ptr(const void *obj) +{ return (void *)__atomic_load_n((void * const *)obj, __ATOMIC_SEQ_CST); } + + +// --- _Py_atomic_load_relaxed 
----------------------------------------------- + +static inline int +_Py_atomic_load_int_relaxed(const int *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + +static inline int8_t +_Py_atomic_load_int8_relaxed(const int8_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + +static inline int16_t +_Py_atomic_load_int16_relaxed(const int16_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + +static inline int32_t +_Py_atomic_load_int32_relaxed(const int32_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + +static inline int64_t +_Py_atomic_load_int64_relaxed(const int64_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + +static inline intptr_t +_Py_atomic_load_intptr_relaxed(const intptr_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + +static inline uint8_t +_Py_atomic_load_uint8_relaxed(const uint8_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + +static inline uint16_t +_Py_atomic_load_uint16_relaxed(const uint16_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + +static inline uint32_t +_Py_atomic_load_uint32_relaxed(const uint32_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + +static inline uint64_t +_Py_atomic_load_uint64_relaxed(const uint64_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + +static inline uintptr_t +_Py_atomic_load_uintptr_relaxed(const uintptr_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + +static inline unsigned int +_Py_atomic_load_uint_relaxed(const unsigned int *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + +static inline Py_ssize_t +_Py_atomic_load_ssize_relaxed(const Py_ssize_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + +static inline void * +_Py_atomic_load_ptr_relaxed(const void *obj) +{ return (void *)__atomic_load_n((void * const *)obj, __ATOMIC_RELAXED); } + +static inline unsigned long long +_Py_atomic_load_ullong_relaxed(const unsigned long long *obj) +{ return __atomic_load_n(obj, __ATOMIC_RELAXED); } + + +// --- _Py_atomic_store ------------------------------------------------------ + +static inline void +_Py_atomic_store_int(int *obj, int value) +{ __atomic_store_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline void +_Py_atomic_store_int8(int8_t *obj, int8_t value) +{ __atomic_store_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline void +_Py_atomic_store_int16(int16_t *obj, int16_t value) +{ __atomic_store_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline void +_Py_atomic_store_int32(int32_t *obj, int32_t value) +{ __atomic_store_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline void +_Py_atomic_store_int64(int64_t *obj, int64_t value) +{ __atomic_store_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline void +_Py_atomic_store_intptr(intptr_t *obj, intptr_t value) +{ __atomic_store_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline void +_Py_atomic_store_uint8(uint8_t *obj, uint8_t value) +{ __atomic_store_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline void +_Py_atomic_store_uint16(uint16_t *obj, uint16_t value) +{ __atomic_store_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline void +_Py_atomic_store_uint32(uint32_t *obj, uint32_t value) +{ __atomic_store_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline void +_Py_atomic_store_uint64(uint64_t *obj, uint64_t value) +{ __atomic_store_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline void +_Py_atomic_store_uintptr(uintptr_t *obj, uintptr_t value) +{ __atomic_store_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline void 
+_Py_atomic_store_uint(unsigned int *obj, unsigned int value) +{ __atomic_store_n(obj, value, __ATOMIC_SEQ_CST); } + +static inline void +_Py_atomic_store_ptr(void *obj, void *value) +{ __atomic_store_n((void **)obj, value, __ATOMIC_SEQ_CST); } + +static inline void +_Py_atomic_store_ssize(Py_ssize_t *obj, Py_ssize_t value) +{ __atomic_store_n(obj, value, __ATOMIC_SEQ_CST); } + + +// --- _Py_atomic_store_relaxed ---------------------------------------------- + +static inline void +_Py_atomic_store_int_relaxed(int *obj, int value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_int8_relaxed(int8_t *obj, int8_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_int16_relaxed(int16_t *obj, int16_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_int32_relaxed(int32_t *obj, int32_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_int64_relaxed(int64_t *obj, int64_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_intptr_relaxed(intptr_t *obj, intptr_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_uint8_relaxed(uint8_t *obj, uint8_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_uint16_relaxed(uint16_t *obj, uint16_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_uint32_relaxed(uint32_t *obj, uint32_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_uint64_relaxed(uint64_t *obj, uint64_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_uintptr_relaxed(uintptr_t *obj, uintptr_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_uint_relaxed(unsigned int *obj, unsigned int value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_ptr_relaxed(void *obj, void *value) +{ __atomic_store_n((void **)obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_ssize_relaxed(Py_ssize_t *obj, Py_ssize_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + +static inline void +_Py_atomic_store_ullong_relaxed(unsigned long long *obj, + unsigned long long value) +{ __atomic_store_n(obj, value, __ATOMIC_RELAXED); } + + +// --- _Py_atomic_load_ptr_acquire / _Py_atomic_store_ptr_release ------------ + +static inline void * +_Py_atomic_load_ptr_acquire(const void *obj) +{ return (void *)__atomic_load_n((void * const *)obj, __ATOMIC_ACQUIRE); } + +static inline uintptr_t +_Py_atomic_load_uintptr_acquire(const uintptr_t *obj) +{ return (uintptr_t)__atomic_load_n(obj, __ATOMIC_ACQUIRE); } + +static inline void +_Py_atomic_store_ptr_release(void *obj, void *value) +{ __atomic_store_n((void **)obj, value, __ATOMIC_RELEASE); } + +static inline void +_Py_atomic_store_uintptr_release(uintptr_t *obj, uintptr_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELEASE); } + +static inline void +_Py_atomic_store_int_release(int *obj, int value) +{ __atomic_store_n(obj, value, __ATOMIC_RELEASE); } + +static inline void +_Py_atomic_store_ssize_release(Py_ssize_t *obj, Py_ssize_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELEASE); } + +static inline int +_Py_atomic_load_int_acquire(const int *obj) +{ return 
__atomic_load_n(obj, __ATOMIC_ACQUIRE); }
+
+static inline void
+_Py_atomic_store_uint32_release(uint32_t *obj, uint32_t value)
+{ __atomic_store_n(obj, value, __ATOMIC_RELEASE); }
+
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value)
+{ __atomic_store_n(obj, value, __ATOMIC_RELEASE); }
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj)
+{ return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
+
+static inline uint32_t
+_Py_atomic_load_uint32_acquire(const uint32_t *obj)
+{ return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
+
+static inline Py_ssize_t
+_Py_atomic_load_ssize_acquire(const Py_ssize_t *obj)
+{ return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
+
+// --- _Py_atomic_fence ------------------------------------------------------
+
+static inline void
+_Py_atomic_fence_seq_cst(void)
+{ __atomic_thread_fence(__ATOMIC_SEQ_CST); }
+
+static inline void
+_Py_atomic_fence_acquire(void)
+{ __atomic_thread_fence(__ATOMIC_ACQUIRE); }
+
+static inline void
+_Py_atomic_fence_release(void)
+{ __atomic_thread_fence(__ATOMIC_RELEASE); }
diff --git a/Include/cpython/pyatomic_msc.h b/Include/cpython/pyatomic_msc.h
new file mode 100644
index 0000000000000000000000000000000000000000..84da21bdcbff4f1ed3bb549db4dfd7350b5dfffe
--- /dev/null
+++ b/Include/cpython/pyatomic_msc.h
@@ -0,0 +1,1095 @@
+// This is the implementation of Python atomic operations for MSVC if the
+// compiler does not support C11 or C++11 atomics.
+//
+// MSVC intrinsics are defined on char, short, long, __int64, and pointer
+// types. Note that long and int are both 32-bits even on 64-bit Windows,
+// so operations on int are cast to long.
+//
+// The volatile keyword has additional memory ordering semantics on MSVC. On
+// x86 and x86-64, volatile accesses have acquire-release semantics. On ARM64,
+// volatile accesses behave like C11's memory_order_relaxed.
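/*
 * [Editorial sketch, not part of the upstream header] On Windows, int and
 * long are both 32 bits, which is why the int wrappers below can route
 * through the long-based Interlocked intrinsics. Hypothetical helper,
 * illustration only.
 */
#if 0
static int
example_add_int(int *obj, int value)
{
    // Safe because sizeof(int) == sizeof(long) == 4 on supported Windows
    // targets; _InterlockedExchangeAdd returns the pre-add value.
    return (int)_InterlockedExchangeAdd((volatile long *)obj, (long)value);
}
#endif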
+
+#ifndef Py_ATOMIC_MSC_H
+# error "this header file must not be included directly"
+#endif
+
+#include <intrin.h>
+
+#define _Py_atomic_ASSERT_ARG_TYPE(TYPE) \
+    Py_BUILD_ASSERT(sizeof(*obj) == sizeof(TYPE))
+
+
+// --- _Py_atomic_add --------------------------------------------------------
+
+static inline int8_t
+_Py_atomic_add_int8(int8_t *obj, int8_t value)
+{
+    _Py_atomic_ASSERT_ARG_TYPE(char);
+    return (int8_t)_InterlockedExchangeAdd8((volatile char *)obj, (char)value);
+}
+
+static inline int16_t
+_Py_atomic_add_int16(int16_t *obj, int16_t value)
+{
+    _Py_atomic_ASSERT_ARG_TYPE(short);
+    return (int16_t)_InterlockedExchangeAdd16((volatile short *)obj, (short)value);
+}
+
+static inline int32_t
+_Py_atomic_add_int32(int32_t *obj, int32_t value)
+{
+    _Py_atomic_ASSERT_ARG_TYPE(long);
+    return (int32_t)_InterlockedExchangeAdd((volatile long *)obj, (long)value);
+}
+
+static inline int64_t
+_Py_atomic_add_int64(int64_t *obj, int64_t value)
+{
+#if defined(_M_X64) || defined(_M_ARM64)
+    _Py_atomic_ASSERT_ARG_TYPE(__int64);
+    return (int64_t)_InterlockedExchangeAdd64((volatile __int64 *)obj, (__int64)value);
+#else
+    int64_t old_value = _Py_atomic_load_int64_relaxed(obj);
+    for (;;) {
+        int64_t new_value = old_value + value;
+        if (_Py_atomic_compare_exchange_int64(obj, &old_value, new_value)) {
+            return old_value;
+        }
+    }
+#endif
+}
+
+
+static inline uint8_t
+_Py_atomic_add_uint8(uint8_t *obj, uint8_t value)
+{
+    return (uint8_t)_Py_atomic_add_int8((int8_t *)obj, (int8_t)value);
+}
+
+static inline uint16_t
+_Py_atomic_add_uint16(uint16_t *obj, uint16_t value)
+{
+    return (uint16_t)_Py_atomic_add_int16((int16_t *)obj, (int16_t)value);
+}
+
+static inline uint32_t
+_Py_atomic_add_uint32(uint32_t *obj, uint32_t value)
+{
+    return (uint32_t)_Py_atomic_add_int32((int32_t *)obj, (int32_t)value);
+}
+
+static inline int
+_Py_atomic_add_int(int *obj, int value)
+{
+    _Py_atomic_ASSERT_ARG_TYPE(int32_t);
+    return (int)_Py_atomic_add_int32((int32_t *)obj, (int32_t)value);
+}
+
+static inline unsigned int
+_Py_atomic_add_uint(unsigned int *obj, unsigned int value)
+{
+    _Py_atomic_ASSERT_ARG_TYPE(int32_t);
+    return (unsigned int)_Py_atomic_add_int32((int32_t *)obj, (int32_t)value);
+}
+
+static inline uint64_t
+_Py_atomic_add_uint64(uint64_t *obj, uint64_t value)
+{
+    return (uint64_t)_Py_atomic_add_int64((int64_t *)obj, (int64_t)value);
+}
+
+static inline intptr_t
+_Py_atomic_add_intptr(intptr_t *obj, intptr_t value)
+{
+#if SIZEOF_VOID_P == 8
+    _Py_atomic_ASSERT_ARG_TYPE(int64_t);
+    return (intptr_t)_Py_atomic_add_int64((int64_t *)obj, (int64_t)value);
+#else
+    _Py_atomic_ASSERT_ARG_TYPE(int32_t);
+    return (intptr_t)_Py_atomic_add_int32((int32_t *)obj, (int32_t)value);
+#endif
+}
+
+static inline uintptr_t
+_Py_atomic_add_uintptr(uintptr_t *obj, uintptr_t value)
+{
+    _Py_atomic_ASSERT_ARG_TYPE(intptr_t);
+    return (uintptr_t)_Py_atomic_add_intptr((intptr_t *)obj, (intptr_t)value);
+}
+
+static inline Py_ssize_t
+_Py_atomic_add_ssize(Py_ssize_t *obj, Py_ssize_t value)
+{
+    _Py_atomic_ASSERT_ARG_TYPE(intptr_t);
+    return (Py_ssize_t)_Py_atomic_add_intptr((intptr_t *)obj, (intptr_t)value);
+}
+
+
+// --- _Py_atomic_compare_exchange -------------------------------------------
+
+static inline int
+_Py_atomic_compare_exchange_int8(int8_t *obj, int8_t *expected, int8_t value)
+{
+    _Py_atomic_ASSERT_ARG_TYPE(char);
+    int8_t initial = (int8_t)_InterlockedCompareExchange8(
+                                       (volatile char *)obj,
+                                       (char)value,
+                                       (char)*expected);
+    if (initial == *expected) {
+        return 1;
+    }
+    *expected = initial;
+    return 0;
+}
+
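/*
 * [Editorial sketch, not part of the upstream header] Where no native 64-bit
 * intrinsic exists (32-bit x86), read-modify-write operations are emulated
 * with a compare-exchange loop, as in _Py_atomic_add_int64() above. Any RMW
 * operation fits the same pattern; the xor helper below is hypothetical.
 */
#if 0
static int64_t
example_fetch_xor_int64(int64_t *obj, int64_t value)
{
    int64_t old_value = _Py_atomic_load_int64_relaxed(obj);
    for (;;) {
        int64_t new_value = old_value ^ value;
        // On failure, old_value is refreshed with the current contents and
        // the desired value is recomputed before retrying.
        if (_Py_atomic_compare_exchange_int64(obj, &old_value, new_value)) {
            return old_value;
        }
    }
}
#endif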
+static inline int +_Py_atomic_compare_exchange_int16(int16_t *obj, int16_t *expected, int16_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(short); + int16_t initial = (int16_t)_InterlockedCompareExchange16( + (volatile short *)obj, + (short)value, + (short)*expected); + if (initial == *expected) { + return 1; + } + *expected = initial; + return 0; +} + +static inline int +_Py_atomic_compare_exchange_int32(int32_t *obj, int32_t *expected, int32_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(long); + int32_t initial = (int32_t)_InterlockedCompareExchange( + (volatile long *)obj, + (long)value, + (long)*expected); + if (initial == *expected) { + return 1; + } + *expected = initial; + return 0; +} + +static inline int +_Py_atomic_compare_exchange_int64(int64_t *obj, int64_t *expected, int64_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(__int64); + int64_t initial = (int64_t)_InterlockedCompareExchange64( + (volatile __int64 *)obj, + (__int64)value, + (__int64)*expected); + if (initial == *expected) { + return 1; + } + *expected = initial; + return 0; +} + +static inline int +_Py_atomic_compare_exchange_ptr(void *obj, void *expected, void *value) +{ + void *initial = _InterlockedCompareExchangePointer( + (void**)obj, + value, + *(void**)expected); + if (initial == *(void**)expected) { + return 1; + } + *(void**)expected = initial; + return 0; +} + + +static inline int +_Py_atomic_compare_exchange_uint8(uint8_t *obj, uint8_t *expected, uint8_t value) +{ + return _Py_atomic_compare_exchange_int8((int8_t *)obj, + (int8_t *)expected, + (int8_t)value); +} + +static inline int +_Py_atomic_compare_exchange_uint16(uint16_t *obj, uint16_t *expected, uint16_t value) +{ + return _Py_atomic_compare_exchange_int16((int16_t *)obj, + (int16_t *)expected, + (int16_t)value); +} + +static inline int +_Py_atomic_compare_exchange_uint32(uint32_t *obj, uint32_t *expected, uint32_t value) +{ + return _Py_atomic_compare_exchange_int32((int32_t *)obj, + (int32_t *)expected, + (int32_t)value); +} + +static inline int +_Py_atomic_compare_exchange_int(int *obj, int *expected, int value) +{ + _Py_atomic_ASSERT_ARG_TYPE(int32_t); + return _Py_atomic_compare_exchange_int32((int32_t *)obj, + (int32_t *)expected, + (int32_t)value); +} + +static inline int +_Py_atomic_compare_exchange_uint(unsigned int *obj, unsigned int *expected, unsigned int value) +{ + _Py_atomic_ASSERT_ARG_TYPE(int32_t); + return _Py_atomic_compare_exchange_int32((int32_t *)obj, + (int32_t *)expected, + (int32_t)value); +} + +static inline int +_Py_atomic_compare_exchange_uint64(uint64_t *obj, uint64_t *expected, uint64_t value) +{ + return _Py_atomic_compare_exchange_int64((int64_t *)obj, + (int64_t *)expected, + (int64_t)value); +} + +static inline int +_Py_atomic_compare_exchange_intptr(intptr_t *obj, intptr_t *expected, intptr_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(void*); + return _Py_atomic_compare_exchange_ptr((void**)obj, + (void**)expected, + (void*)value); +} + +static inline int +_Py_atomic_compare_exchange_uintptr(uintptr_t *obj, uintptr_t *expected, uintptr_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(void*); + return _Py_atomic_compare_exchange_ptr((void**)obj, + (void**)expected, + (void*)value); +} + +static inline int +_Py_atomic_compare_exchange_ssize(Py_ssize_t *obj, Py_ssize_t *expected, Py_ssize_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(void*); + return _Py_atomic_compare_exchange_ptr((void**)obj, + (void**)expected, + (void*)value); +} + + +// --- _Py_atomic_exchange --------------------------------------------------- + +static inline int8_t 
+_Py_atomic_exchange_int8(int8_t *obj, int8_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(char); + return (int8_t)_InterlockedExchange8((volatile char *)obj, (char)value); +} + +static inline int16_t +_Py_atomic_exchange_int16(int16_t *obj, int16_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(short); + return (int16_t)_InterlockedExchange16((volatile short *)obj, (short)value); +} + +static inline int32_t +_Py_atomic_exchange_int32(int32_t *obj, int32_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(long); + return (int32_t)_InterlockedExchange((volatile long *)obj, (long)value); +} + +static inline int64_t +_Py_atomic_exchange_int64(int64_t *obj, int64_t value) +{ +#if defined(_M_X64) || defined(_M_ARM64) + _Py_atomic_ASSERT_ARG_TYPE(__int64); + return (int64_t)_InterlockedExchange64((volatile __int64 *)obj, (__int64)value); +#else + int64_t old_value = _Py_atomic_load_int64_relaxed(obj); + for (;;) { + if (_Py_atomic_compare_exchange_int64(obj, &old_value, value)) { + return old_value; + } + } +#endif +} + +static inline void* +_Py_atomic_exchange_ptr(void *obj, void *value) +{ + return (void*)_InterlockedExchangePointer((void * volatile *)obj, (void *)value); +} + + +static inline uint8_t +_Py_atomic_exchange_uint8(uint8_t *obj, uint8_t value) +{ + return (uint8_t)_Py_atomic_exchange_int8((int8_t *)obj, + (int8_t)value); +} + +static inline uint16_t +_Py_atomic_exchange_uint16(uint16_t *obj, uint16_t value) +{ + return (uint16_t)_Py_atomic_exchange_int16((int16_t *)obj, + (int16_t)value); +} + +static inline uint32_t +_Py_atomic_exchange_uint32(uint32_t *obj, uint32_t value) +{ + return (uint32_t)_Py_atomic_exchange_int32((int32_t *)obj, + (int32_t)value); +} + +static inline int +_Py_atomic_exchange_int(int *obj, int value) +{ + _Py_atomic_ASSERT_ARG_TYPE(int32_t); + return (int)_Py_atomic_exchange_int32((int32_t *)obj, + (int32_t)value); +} + +static inline unsigned int +_Py_atomic_exchange_uint(unsigned int *obj, unsigned int value) +{ + _Py_atomic_ASSERT_ARG_TYPE(int32_t); + return (unsigned int)_Py_atomic_exchange_int32((int32_t *)obj, + (int32_t)value); +} + +static inline uint64_t +_Py_atomic_exchange_uint64(uint64_t *obj, uint64_t value) +{ + return (uint64_t)_Py_atomic_exchange_int64((int64_t *)obj, + (int64_t)value); +} + +static inline intptr_t +_Py_atomic_exchange_intptr(intptr_t *obj, intptr_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(void*); + return (intptr_t)_Py_atomic_exchange_ptr((void**)obj, + (void*)value); +} + +static inline uintptr_t +_Py_atomic_exchange_uintptr(uintptr_t *obj, uintptr_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(void*); + return (uintptr_t)_Py_atomic_exchange_ptr((void**)obj, + (void*)value); +} + +static inline Py_ssize_t +_Py_atomic_exchange_ssize(Py_ssize_t *obj, Py_ssize_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(void*); + return (Py_ssize_t)_Py_atomic_exchange_ptr((void**)obj, + (void*)value); +} + + +// --- _Py_atomic_and -------------------------------------------------------- + +static inline uint8_t +_Py_atomic_and_uint8(uint8_t *obj, uint8_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(char); + return (uint8_t)_InterlockedAnd8((volatile char *)obj, (char)value); +} + +static inline uint16_t +_Py_atomic_and_uint16(uint16_t *obj, uint16_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(short); + return (uint16_t)_InterlockedAnd16((volatile short *)obj, (short)value); +} + +static inline uint32_t +_Py_atomic_and_uint32(uint32_t *obj, uint32_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(long); + return (uint32_t)_InterlockedAnd((volatile long *)obj, (long)value); +} + +static inline 
uint64_t +_Py_atomic_and_uint64(uint64_t *obj, uint64_t value) +{ +#if defined(_M_X64) || defined(_M_ARM64) + _Py_atomic_ASSERT_ARG_TYPE(__int64); + return (uint64_t)_InterlockedAnd64((volatile __int64 *)obj, (__int64)value); +#else + uint64_t old_value = _Py_atomic_load_uint64_relaxed(obj); + for (;;) { + uint64_t new_value = old_value & value; + if (_Py_atomic_compare_exchange_uint64(obj, &old_value, new_value)) { + return old_value; + } + } +#endif +} + +static inline uintptr_t +_Py_atomic_and_uintptr(uintptr_t *obj, uintptr_t value) +{ +#if SIZEOF_VOID_P == 8 + _Py_atomic_ASSERT_ARG_TYPE(uint64_t); + return (uintptr_t)_Py_atomic_and_uint64((uint64_t *)obj, + (uint64_t)value); +#else + _Py_atomic_ASSERT_ARG_TYPE(uint32_t); + return (uintptr_t)_Py_atomic_and_uint32((uint32_t *)obj, + (uint32_t)value); +#endif +} + + +// --- _Py_atomic_or --------------------------------------------------------- + +static inline uint8_t +_Py_atomic_or_uint8(uint8_t *obj, uint8_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(char); + return (uint8_t)_InterlockedOr8((volatile char *)obj, (char)value); +} + +static inline uint16_t +_Py_atomic_or_uint16(uint16_t *obj, uint16_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(short); + return (uint16_t)_InterlockedOr16((volatile short *)obj, (short)value); +} + +static inline uint32_t +_Py_atomic_or_uint32(uint32_t *obj, uint32_t value) +{ + _Py_atomic_ASSERT_ARG_TYPE(long); + return (uint32_t)_InterlockedOr((volatile long *)obj, (long)value); +} + +static inline uint64_t +_Py_atomic_or_uint64(uint64_t *obj, uint64_t value) +{ +#if defined(_M_X64) || defined(_M_ARM64) + _Py_atomic_ASSERT_ARG_TYPE(__int64); + return (uint64_t)_InterlockedOr64((volatile __int64 *)obj, (__int64)value); +#else + uint64_t old_value = _Py_atomic_load_uint64_relaxed(obj); + for (;;) { + uint64_t new_value = old_value | value; + if (_Py_atomic_compare_exchange_uint64(obj, &old_value, new_value)) { + return old_value; + } + } +#endif +} + + +static inline uintptr_t +_Py_atomic_or_uintptr(uintptr_t *obj, uintptr_t value) +{ +#if SIZEOF_VOID_P == 8 + _Py_atomic_ASSERT_ARG_TYPE(uint64_t); + return (uintptr_t)_Py_atomic_or_uint64((uint64_t *)obj, + (uint64_t)value); +#else + _Py_atomic_ASSERT_ARG_TYPE(uint32_t); + return (uintptr_t)_Py_atomic_or_uint32((uint32_t *)obj, + (uint32_t)value); +#endif +} + + +// --- _Py_atomic_load ------------------------------------------------------- + +static inline uint8_t +_Py_atomic_load_uint8(const uint8_t *obj) +{ +#if defined(_M_X64) || defined(_M_IX86) + return *(volatile uint8_t *)obj; +#elif defined(_M_ARM64) + return (uint8_t)__ldar8((unsigned __int8 volatile *)obj); +#else +# error "no implementation of _Py_atomic_load_uint8" +#endif +} + +static inline uint16_t +_Py_atomic_load_uint16(const uint16_t *obj) +{ +#if defined(_M_X64) || defined(_M_IX86) + return *(volatile uint16_t *)obj; +#elif defined(_M_ARM64) + return (uint16_t)__ldar16((unsigned __int16 volatile *)obj); +#else +# error "no implementation of _Py_atomic_load_uint16" +#endif +} + +static inline uint32_t +_Py_atomic_load_uint32(const uint32_t *obj) +{ +#if defined(_M_X64) || defined(_M_IX86) + return *(volatile uint32_t *)obj; +#elif defined(_M_ARM64) + return (uint32_t)__ldar32((unsigned __int32 volatile *)obj); +#else +# error "no implementation of _Py_atomic_load_uint32" +#endif +} + +static inline uint64_t +_Py_atomic_load_uint64(const uint64_t *obj) +{ +#if defined(_M_X64) || defined(_M_IX86) + return *(volatile uint64_t *)obj; +#elif defined(_M_ARM64) + return (uint64_t)__ldar64((unsigned 
__int64 volatile *)obj); +#else +# error "no implementation of _Py_atomic_load_uint64" +#endif +} + +static inline int8_t +_Py_atomic_load_int8(const int8_t *obj) +{ + return (int8_t)_Py_atomic_load_uint8((const uint8_t *)obj); +} + +static inline int16_t +_Py_atomic_load_int16(const int16_t *obj) +{ + return (int16_t)_Py_atomic_load_uint16((const uint16_t *)obj); +} + +static inline int32_t +_Py_atomic_load_int32(const int32_t *obj) +{ + return (int32_t)_Py_atomic_load_uint32((const uint32_t *)obj); +} + +static inline int +_Py_atomic_load_int(const int *obj) +{ + _Py_atomic_ASSERT_ARG_TYPE(uint32_t); + return (int)_Py_atomic_load_uint32((uint32_t *)obj); +} + +static inline unsigned int +_Py_atomic_load_uint(const unsigned int *obj) +{ + _Py_atomic_ASSERT_ARG_TYPE(uint32_t); + return (unsigned int)_Py_atomic_load_uint32((uint32_t *)obj); +} + +static inline int64_t +_Py_atomic_load_int64(const int64_t *obj) +{ + return (int64_t)_Py_atomic_load_uint64((const uint64_t *)obj); +} + +static inline void* +_Py_atomic_load_ptr(const void *obj) +{ +#if SIZEOF_VOID_P == 8 + return (void*)_Py_atomic_load_uint64((const uint64_t *)obj); +#else + return (void*)_Py_atomic_load_uint32((const uint32_t *)obj); +#endif +} + +static inline intptr_t +_Py_atomic_load_intptr(const intptr_t *obj) +{ + _Py_atomic_ASSERT_ARG_TYPE(void*); + return (intptr_t)_Py_atomic_load_ptr((void*)obj); +} + +static inline uintptr_t +_Py_atomic_load_uintptr(const uintptr_t *obj) +{ + _Py_atomic_ASSERT_ARG_TYPE(void*); + return (uintptr_t)_Py_atomic_load_ptr((void*)obj); +} + +static inline Py_ssize_t +_Py_atomic_load_ssize(const Py_ssize_t *obj) +{ + _Py_atomic_ASSERT_ARG_TYPE(void*); + return (Py_ssize_t)_Py_atomic_load_ptr((void*)obj); +} + + +// --- _Py_atomic_load_relaxed ----------------------------------------------- + +static inline int +_Py_atomic_load_int_relaxed(const int *obj) +{ + return *(volatile int *)obj; +} + +static inline int8_t +_Py_atomic_load_int8_relaxed(const int8_t *obj) +{ + return *(volatile int8_t *)obj; +} + +static inline int16_t +_Py_atomic_load_int16_relaxed(const int16_t *obj) +{ + return *(volatile int16_t *)obj; +} + +static inline int32_t +_Py_atomic_load_int32_relaxed(const int32_t *obj) +{ + return *(volatile int32_t *)obj; +} + +static inline int64_t +_Py_atomic_load_int64_relaxed(const int64_t *obj) +{ + return *(volatile int64_t *)obj; +} + +static inline intptr_t +_Py_atomic_load_intptr_relaxed(const intptr_t *obj) +{ + return *(volatile intptr_t *)obj; +} + +static inline uint8_t +_Py_atomic_load_uint8_relaxed(const uint8_t *obj) +{ + return *(volatile uint8_t *)obj; +} + +static inline uint16_t +_Py_atomic_load_uint16_relaxed(const uint16_t *obj) +{ + return *(volatile uint16_t *)obj; +} + +static inline uint32_t +_Py_atomic_load_uint32_relaxed(const uint32_t *obj) +{ + return *(volatile uint32_t *)obj; +} + +static inline uint64_t +_Py_atomic_load_uint64_relaxed(const uint64_t *obj) +{ + return *(volatile uint64_t *)obj; +} + +static inline uintptr_t +_Py_atomic_load_uintptr_relaxed(const uintptr_t *obj) +{ + return *(volatile uintptr_t *)obj; +} + +static inline unsigned int +_Py_atomic_load_uint_relaxed(const unsigned int *obj) +{ + return *(volatile unsigned int *)obj; +} + +static inline Py_ssize_t +_Py_atomic_load_ssize_relaxed(const Py_ssize_t *obj) +{ + return *(volatile Py_ssize_t *)obj; +} + +static inline void* +_Py_atomic_load_ptr_relaxed(const void *obj) +{ + return *(void * volatile *)obj; +} + +static inline unsigned long long +_Py_atomic_load_ullong_relaxed(const 
unsigned long long *obj) +{ + return *(volatile unsigned long long *)obj; +} + + +// --- _Py_atomic_store ------------------------------------------------------ + +static inline void +_Py_atomic_store_int(int *obj, int value) +{ + (void)_Py_atomic_exchange_int(obj, value); +} + +static inline void +_Py_atomic_store_int8(int8_t *obj, int8_t value) +{ + (void)_Py_atomic_exchange_int8(obj, value); +} + +static inline void +_Py_atomic_store_int16(int16_t *obj, int16_t value) +{ + (void)_Py_atomic_exchange_int16(obj, value); +} + +static inline void +_Py_atomic_store_int32(int32_t *obj, int32_t value) +{ + (void)_Py_atomic_exchange_int32(obj, value); +} + +static inline void +_Py_atomic_store_int64(int64_t *obj, int64_t value) +{ + (void)_Py_atomic_exchange_int64(obj, value); +} + +static inline void +_Py_atomic_store_intptr(intptr_t *obj, intptr_t value) +{ + (void)_Py_atomic_exchange_intptr(obj, value); +} + +static inline void +_Py_atomic_store_uint8(uint8_t *obj, uint8_t value) +{ + (void)_Py_atomic_exchange_uint8(obj, value); +} + +static inline void +_Py_atomic_store_uint16(uint16_t *obj, uint16_t value) +{ + (void)_Py_atomic_exchange_uint16(obj, value); +} + +static inline void +_Py_atomic_store_uint32(uint32_t *obj, uint32_t value) +{ + (void)_Py_atomic_exchange_uint32(obj, value); +} + +static inline void +_Py_atomic_store_uint64(uint64_t *obj, uint64_t value) +{ + (void)_Py_atomic_exchange_uint64(obj, value); +} + +static inline void +_Py_atomic_store_uintptr(uintptr_t *obj, uintptr_t value) +{ + (void)_Py_atomic_exchange_uintptr(obj, value); +} + +static inline void +_Py_atomic_store_uint(unsigned int *obj, unsigned int value) +{ + (void)_Py_atomic_exchange_uint(obj, value); +} + +static inline void +_Py_atomic_store_ptr(void *obj, void *value) +{ + (void)_Py_atomic_exchange_ptr(obj, value); +} + +static inline void +_Py_atomic_store_ssize(Py_ssize_t *obj, Py_ssize_t value) +{ + (void)_Py_atomic_exchange_ssize(obj, value); +} + + +// --- _Py_atomic_store_relaxed ---------------------------------------------- + +static inline void +_Py_atomic_store_int_relaxed(int *obj, int value) +{ + *(volatile int *)obj = value; +} + +static inline void +_Py_atomic_store_int8_relaxed(int8_t *obj, int8_t value) +{ + *(volatile int8_t *)obj = value; +} + +static inline void +_Py_atomic_store_int16_relaxed(int16_t *obj, int16_t value) +{ + *(volatile int16_t *)obj = value; +} + +static inline void +_Py_atomic_store_int32_relaxed(int32_t *obj, int32_t value) +{ + *(volatile int32_t *)obj = value; +} + +static inline void +_Py_atomic_store_int64_relaxed(int64_t *obj, int64_t value) +{ + *(volatile int64_t *)obj = value; +} + +static inline void +_Py_atomic_store_intptr_relaxed(intptr_t *obj, intptr_t value) +{ + *(volatile intptr_t *)obj = value; +} + +static inline void +_Py_atomic_store_uint8_relaxed(uint8_t *obj, uint8_t value) +{ + *(volatile uint8_t *)obj = value; +} + +static inline void +_Py_atomic_store_uint16_relaxed(uint16_t *obj, uint16_t value) +{ + *(volatile uint16_t *)obj = value; +} + +static inline void +_Py_atomic_store_uint32_relaxed(uint32_t *obj, uint32_t value) +{ + *(volatile uint32_t *)obj = value; +} + +static inline void +_Py_atomic_store_uint64_relaxed(uint64_t *obj, uint64_t value) +{ + *(volatile uint64_t *)obj = value; +} + +static inline void +_Py_atomic_store_uintptr_relaxed(uintptr_t *obj, uintptr_t value) +{ + *(volatile uintptr_t *)obj = value; +} + +static inline void +_Py_atomic_store_uint_relaxed(unsigned int *obj, unsigned int value) +{ + *(volatile unsigned int 
*)obj = value; +} + +static inline void +_Py_atomic_store_ptr_relaxed(void *obj, void* value) +{ + *(void * volatile *)obj = value; +} + +static inline void +_Py_atomic_store_ssize_relaxed(Py_ssize_t *obj, Py_ssize_t value) +{ + *(volatile Py_ssize_t *)obj = value; +} + +static inline void +_Py_atomic_store_ullong_relaxed(unsigned long long *obj, + unsigned long long value) +{ + *(volatile unsigned long long *)obj = value; +} + + +// --- _Py_atomic_load_ptr_acquire / _Py_atomic_store_ptr_release ------------ + +static inline void * +_Py_atomic_load_ptr_acquire(const void *obj) +{ +#if defined(_M_X64) || defined(_M_IX86) + return *(void * volatile *)obj; +#elif defined(_M_ARM64) + return (void *)__ldar64((unsigned __int64 volatile *)obj); +#else +# error "no implementation of _Py_atomic_load_ptr_acquire" +#endif +} + +static inline uintptr_t +_Py_atomic_load_uintptr_acquire(const uintptr_t *obj) +{ +#if defined(_M_X64) || defined(_M_IX86) + return *(uintptr_t volatile *)obj; +#elif defined(_M_ARM64) + return (uintptr_t)__ldar64((unsigned __int64 volatile *)obj); +#else +# error "no implementation of _Py_atomic_load_uintptr_acquire" +#endif +} + +static inline void +_Py_atomic_store_ptr_release(void *obj, void *value) +{ +#if defined(_M_X64) || defined(_M_IX86) + *(void * volatile *)obj = value; +#elif defined(_M_ARM64) + __stlr64((unsigned __int64 volatile *)obj, (uintptr_t)value); +#else +# error "no implementation of _Py_atomic_store_ptr_release" +#endif +} + +static inline void +_Py_atomic_store_uintptr_release(uintptr_t *obj, uintptr_t value) +{ +#if defined(_M_X64) || defined(_M_IX86) + *(uintptr_t volatile *)obj = value; +#elif defined(_M_ARM64) + _Py_atomic_ASSERT_ARG_TYPE(unsigned __int64); + __stlr64((unsigned __int64 volatile *)obj, (unsigned __int64)value); +#else +# error "no implementation of _Py_atomic_store_uintptr_release" +#endif +} + +static inline void +_Py_atomic_store_int_release(int *obj, int value) +{ +#if defined(_M_X64) || defined(_M_IX86) + *(int volatile *)obj = value; +#elif defined(_M_ARM64) + _Py_atomic_ASSERT_ARG_TYPE(unsigned __int32); + __stlr32((unsigned __int32 volatile *)obj, (unsigned __int32)value); +#else +# error "no implementation of _Py_atomic_store_int_release" +#endif +} + +static inline void +_Py_atomic_store_ssize_release(Py_ssize_t *obj, Py_ssize_t value) +{ +#if defined(_M_X64) || defined(_M_IX86) + *(Py_ssize_t volatile *)obj = value; +#elif defined(_M_ARM64) + __stlr64((unsigned __int64 volatile *)obj, (unsigned __int64)value); +#else +# error "no implementation of _Py_atomic_store_ssize_release" +#endif +} + +static inline int +_Py_atomic_load_int_acquire(const int *obj) +{ +#if defined(_M_X64) || defined(_M_IX86) + return *(int volatile *)obj; +#elif defined(_M_ARM64) + _Py_atomic_ASSERT_ARG_TYPE(unsigned __int32); + return (int)__ldar32((unsigned __int32 volatile *)obj); +#else +# error "no implementation of _Py_atomic_load_int_acquire" +#endif +} + +static inline void +_Py_atomic_store_uint32_release(uint32_t *obj, uint32_t value) +{ +#if defined(_M_X64) || defined(_M_IX86) + *(uint32_t volatile *)obj = value; +#elif defined(_M_ARM64) + _Py_atomic_ASSERT_ARG_TYPE(unsigned __int32); + __stlr32((unsigned __int32 volatile *)obj, (unsigned __int32)value); +#else +# error "no implementation of _Py_atomic_store_uint32_release" +#endif +} + +static inline void +_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value) +{ +#if defined(_M_X64) || defined(_M_IX86) + *(uint64_t volatile *)obj = value; +#elif defined(_M_ARM64) + 
_Py_atomic_ASSERT_ARG_TYPE(unsigned __int64); + __stlr64((unsigned __int64 volatile *)obj, (unsigned __int64)value); +#else +# error "no implementation of _Py_atomic_store_uint64_release" +#endif +} + +static inline uint64_t +_Py_atomic_load_uint64_acquire(const uint64_t *obj) +{ +#if defined(_M_X64) || defined(_M_IX86) + return *(uint64_t volatile *)obj; +#elif defined(_M_ARM64) + _Py_atomic_ASSERT_ARG_TYPE(__int64); + return (uint64_t)__ldar64((unsigned __int64 volatile *)obj); +#else +# error "no implementation of _Py_atomic_load_uint64_acquire" +#endif +} + +static inline uint32_t +_Py_atomic_load_uint32_acquire(const uint32_t *obj) +{ +#if defined(_M_X64) || defined(_M_IX86) + return *(uint32_t volatile *)obj; +#elif defined(_M_ARM64) + return (uint32_t)__ldar32((uint32_t volatile *)obj); +#else +# error "no implementation of _Py_atomic_load_uint32_acquire" +#endif +} + +static inline Py_ssize_t +_Py_atomic_load_ssize_acquire(const Py_ssize_t *obj) +{ +#if defined(_M_X64) || defined(_M_IX86) + return *(Py_ssize_t volatile *)obj; +#elif defined(_M_ARM64) + return (Py_ssize_t)__ldar64((unsigned __int64 volatile *)obj); +#else +# error "no implementation of _Py_atomic_load_ssize_acquire" +#endif +} + +// --- _Py_atomic_fence ------------------------------------------------------ + + static inline void +_Py_atomic_fence_seq_cst(void) +{ +#if defined(_M_ARM64) + __dmb(_ARM64_BARRIER_ISH); +#elif defined(_M_X64) + __faststorefence(); +#elif defined(_M_IX86) + _mm_mfence(); +#else +# error "no implementation of _Py_atomic_fence_seq_cst" +#endif +} + + static inline void +_Py_atomic_fence_acquire(void) +{ +#if defined(_M_ARM64) + __dmb(_ARM64_BARRIER_ISHLD); +#elif defined(_M_X64) || defined(_M_IX86) + _ReadBarrier(); +#else +# error "no implementation of _Py_atomic_fence_acquire" +#endif +} + + static inline void +_Py_atomic_fence_release(void) +{ +#if defined(_M_ARM64) + __dmb(_ARM64_BARRIER_ISH); +#elif defined(_M_X64) || defined(_M_IX86) + _ReadWriteBarrier(); +#else +# error "no implementation of _Py_atomic_fence_release" +#endif +} + +#undef _Py_atomic_ASSERT_ARG_TYPE diff --git a/Include/cpython/pyatomic_std.h b/Include/cpython/pyatomic_std.h new file mode 100644 index 0000000000000000000000000000000000000000..7c71e94c68f8e6d16664201bd71f8538c10d361d --- /dev/null +++ b/Include/cpython/pyatomic_std.h @@ -0,0 +1,976 @@ +// This is the implementation of Python atomic operations using C++11 or C11 +// atomics. Note that the pyatomic_gcc.h implementation is preferred for GCC +// compatible compilers, even if they support C++11 atomics. 
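+//
+// A minimal usage sketch (illustrative only; `counter` is a hypothetical
+// variable, not part of this header):
+//
+//      static int counter;
+//      (void)_Py_atomic_add_int(&counter, 1);       // fetch-and-add; returns the old value
+//      int seen = _Py_atomic_load_int(&counter);    // sequentially consistent load
+//      _Py_atomic_store_int_relaxed(&counter, 0);   // relaxed store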
+ +#ifndef Py_ATOMIC_STD_H +# error "this header file must not be included directly" +#endif + +#ifdef __cplusplus +extern "C++" { +# include <atomic> +} +# define _Py_USING_STD using namespace std +# define _Atomic(tp) atomic<tp> +#else +# define _Py_USING_STD +# include <stdatomic.h> +#endif + + +// --- _Py_atomic_add -------------------------------------------------------- + +static inline int +_Py_atomic_add_int(int *obj, int value) +{ + _Py_USING_STD; + return atomic_fetch_add((_Atomic(int)*)obj, value); +} + +static inline int8_t +_Py_atomic_add_int8(int8_t *obj, int8_t value) +{ + _Py_USING_STD; + return atomic_fetch_add((_Atomic(int8_t)*)obj, value); +} + +static inline int16_t +_Py_atomic_add_int16(int16_t *obj, int16_t value) +{ + _Py_USING_STD; + return atomic_fetch_add((_Atomic(int16_t)*)obj, value); +} + +static inline int32_t +_Py_atomic_add_int32(int32_t *obj, int32_t value) +{ + _Py_USING_STD; + return atomic_fetch_add((_Atomic(int32_t)*)obj, value); +} + +static inline int64_t +_Py_atomic_add_int64(int64_t *obj, int64_t value) +{ + _Py_USING_STD; + return atomic_fetch_add((_Atomic(int64_t)*)obj, value); +} + +static inline intptr_t +_Py_atomic_add_intptr(intptr_t *obj, intptr_t value) +{ + _Py_USING_STD; + return atomic_fetch_add((_Atomic(intptr_t)*)obj, value); +} + +static inline unsigned int +_Py_atomic_add_uint(unsigned int *obj, unsigned int value) +{ + _Py_USING_STD; + return atomic_fetch_add((_Atomic(unsigned int)*)obj, value); +} + +static inline uint8_t +_Py_atomic_add_uint8(uint8_t *obj, uint8_t value) +{ + _Py_USING_STD; + return atomic_fetch_add((_Atomic(uint8_t)*)obj, value); +} + +static inline uint16_t +_Py_atomic_add_uint16(uint16_t *obj, uint16_t value) +{ + _Py_USING_STD; + return atomic_fetch_add((_Atomic(uint16_t)*)obj, value); +} + +static inline uint32_t +_Py_atomic_add_uint32(uint32_t *obj, uint32_t value) +{ + _Py_USING_STD; + return atomic_fetch_add((_Atomic(uint32_t)*)obj, value); +} + +static inline uint64_t +_Py_atomic_add_uint64(uint64_t *obj, uint64_t value) +{ + _Py_USING_STD; + return atomic_fetch_add((_Atomic(uint64_t)*)obj, value); +} + +static inline uintptr_t +_Py_atomic_add_uintptr(uintptr_t *obj, uintptr_t value) +{ + _Py_USING_STD; + return atomic_fetch_add((_Atomic(uintptr_t)*)obj, value); +} + +static inline Py_ssize_t +_Py_atomic_add_ssize(Py_ssize_t *obj, Py_ssize_t value) +{ + _Py_USING_STD; + return atomic_fetch_add((_Atomic(Py_ssize_t)*)obj, value); +} + + +// --- _Py_atomic_compare_exchange ------------------------------------------- + +static inline int +_Py_atomic_compare_exchange_int(int *obj, int *expected, int desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(int)*)obj, + expected, desired); +} + +static inline int +_Py_atomic_compare_exchange_int8(int8_t *obj, int8_t *expected, int8_t desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(int8_t)*)obj, + expected, desired); +} + +static inline int +_Py_atomic_compare_exchange_int16(int16_t *obj, int16_t *expected, int16_t desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(int16_t)*)obj, + expected, desired); +} + +static inline int +_Py_atomic_compare_exchange_int32(int32_t *obj, int32_t *expected, int32_t desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(int32_t)*)obj, + expected, desired); +} + +static inline int +_Py_atomic_compare_exchange_int64(int64_t *obj, int64_t *expected, int64_t desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(int64_t)*)obj, + expected, 
desired); +} + +static inline int +_Py_atomic_compare_exchange_intptr(intptr_t *obj, intptr_t *expected, intptr_t desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(intptr_t)*)obj, + expected, desired); +} + +static inline int +_Py_atomic_compare_exchange_uint(unsigned int *obj, unsigned int *expected, unsigned int desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(unsigned int)*)obj, + expected, desired); +} + +static inline int +_Py_atomic_compare_exchange_uint8(uint8_t *obj, uint8_t *expected, uint8_t desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(uint8_t)*)obj, + expected, desired); +} + +static inline int +_Py_atomic_compare_exchange_uint16(uint16_t *obj, uint16_t *expected, uint16_t desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(uint16_t)*)obj, + expected, desired); +} + +static inline int +_Py_atomic_compare_exchange_uint32(uint32_t *obj, uint32_t *expected, uint32_t desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(uint32_t)*)obj, + expected, desired); +} + +static inline int +_Py_atomic_compare_exchange_uint64(uint64_t *obj, uint64_t *expected, uint64_t desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(uint64_t)*)obj, + expected, desired); +} + +static inline int +_Py_atomic_compare_exchange_uintptr(uintptr_t *obj, uintptr_t *expected, uintptr_t desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(uintptr_t)*)obj, + expected, desired); +} + +static inline int +_Py_atomic_compare_exchange_ssize(Py_ssize_t *obj, Py_ssize_t *expected, Py_ssize_t desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(Py_ssize_t)*)obj, + expected, desired); +} + +static inline int +_Py_atomic_compare_exchange_ptr(void *obj, void *expected, void *desired) +{ + _Py_USING_STD; + return atomic_compare_exchange_strong((_Atomic(void *)*)obj, + (void **)expected, desired); +} + + +// --- _Py_atomic_exchange --------------------------------------------------- + +static inline int +_Py_atomic_exchange_int(int *obj, int value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(int)*)obj, value); +} + +static inline int8_t +_Py_atomic_exchange_int8(int8_t *obj, int8_t value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(int8_t)*)obj, value); +} + +static inline int16_t +_Py_atomic_exchange_int16(int16_t *obj, int16_t value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(int16_t)*)obj, value); +} + +static inline int32_t +_Py_atomic_exchange_int32(int32_t *obj, int32_t value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(int32_t)*)obj, value); +} + +static inline int64_t +_Py_atomic_exchange_int64(int64_t *obj, int64_t value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(int64_t)*)obj, value); +} + +static inline intptr_t +_Py_atomic_exchange_intptr(intptr_t *obj, intptr_t value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(intptr_t)*)obj, value); +} + +static inline unsigned int +_Py_atomic_exchange_uint(unsigned int *obj, unsigned int value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(unsigned int)*)obj, value); +} + +static inline uint8_t +_Py_atomic_exchange_uint8(uint8_t *obj, uint8_t value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(uint8_t)*)obj, value); +} + +static inline uint16_t +_Py_atomic_exchange_uint16(uint16_t *obj, uint16_t value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(uint16_t)*)obj, value); +} + 
+static inline uint32_t +_Py_atomic_exchange_uint32(uint32_t *obj, uint32_t value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(uint32_t)*)obj, value); +} + +static inline uint64_t +_Py_atomic_exchange_uint64(uint64_t *obj, uint64_t value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(uint64_t)*)obj, value); +} + +static inline uintptr_t +_Py_atomic_exchange_uintptr(uintptr_t *obj, uintptr_t value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(uintptr_t)*)obj, value); +} + +static inline Py_ssize_t +_Py_atomic_exchange_ssize(Py_ssize_t *obj, Py_ssize_t value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(Py_ssize_t)*)obj, value); +} + +static inline void* +_Py_atomic_exchange_ptr(void *obj, void *value) +{ + _Py_USING_STD; + return atomic_exchange((_Atomic(void *)*)obj, value); +} + + +// --- _Py_atomic_and -------------------------------------------------------- + +static inline uint8_t +_Py_atomic_and_uint8(uint8_t *obj, uint8_t value) +{ + _Py_USING_STD; + return atomic_fetch_and((_Atomic(uint8_t)*)obj, value); +} + +static inline uint16_t +_Py_atomic_and_uint16(uint16_t *obj, uint16_t value) +{ + _Py_USING_STD; + return atomic_fetch_and((_Atomic(uint16_t)*)obj, value); +} + +static inline uint32_t +_Py_atomic_and_uint32(uint32_t *obj, uint32_t value) +{ + _Py_USING_STD; + return atomic_fetch_and((_Atomic(uint32_t)*)obj, value); +} + +static inline uint64_t +_Py_atomic_and_uint64(uint64_t *obj, uint64_t value) +{ + _Py_USING_STD; + return atomic_fetch_and((_Atomic(uint64_t)*)obj, value); +} + +static inline uintptr_t +_Py_atomic_and_uintptr(uintptr_t *obj, uintptr_t value) +{ + _Py_USING_STD; + return atomic_fetch_and((_Atomic(uintptr_t)*)obj, value); +} + + +// --- _Py_atomic_or --------------------------------------------------------- + +static inline uint8_t +_Py_atomic_or_uint8(uint8_t *obj, uint8_t value) +{ + _Py_USING_STD; + return atomic_fetch_or((_Atomic(uint8_t)*)obj, value); +} + +static inline uint16_t +_Py_atomic_or_uint16(uint16_t *obj, uint16_t value) +{ + _Py_USING_STD; + return atomic_fetch_or((_Atomic(uint16_t)*)obj, value); +} + +static inline uint32_t +_Py_atomic_or_uint32(uint32_t *obj, uint32_t value) +{ + _Py_USING_STD; + return atomic_fetch_or((_Atomic(uint32_t)*)obj, value); +} + +static inline uint64_t +_Py_atomic_or_uint64(uint64_t *obj, uint64_t value) +{ + _Py_USING_STD; + return atomic_fetch_or((_Atomic(uint64_t)*)obj, value); +} + +static inline uintptr_t +_Py_atomic_or_uintptr(uintptr_t *obj, uintptr_t value) +{ + _Py_USING_STD; + return atomic_fetch_or((_Atomic(uintptr_t)*)obj, value); +} + + +// --- _Py_atomic_load ------------------------------------------------------- + +static inline int +_Py_atomic_load_int(const int *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(int)*)obj); +} + +static inline int8_t +_Py_atomic_load_int8(const int8_t *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(int8_t)*)obj); +} + +static inline int16_t +_Py_atomic_load_int16(const int16_t *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(int16_t)*)obj); +} + +static inline int32_t +_Py_atomic_load_int32(const int32_t *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(int32_t)*)obj); +} + +static inline int64_t +_Py_atomic_load_int64(const int64_t *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(int64_t)*)obj); +} + +static inline intptr_t +_Py_atomic_load_intptr(const intptr_t *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(intptr_t)*)obj); +} + +static inline 
uint8_t +_Py_atomic_load_uint8(const uint8_t *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(uint8_t)*)obj); +} + +static inline uint16_t +_Py_atomic_load_uint16(const uint16_t *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(uint16_t)*)obj); +} + +static inline uint32_t +_Py_atomic_load_uint32(const uint32_t *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(uint32_t)*)obj); +} + +static inline uint64_t +_Py_atomic_load_uint64(const uint64_t *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(uint64_t)*)obj); +} + +static inline uintptr_t +_Py_atomic_load_uintptr(const uintptr_t *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(uintptr_t)*)obj); +} + +static inline unsigned int +_Py_atomic_load_uint(const unsigned int *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(unsigned int)*)obj); +} + +static inline Py_ssize_t +_Py_atomic_load_ssize(const Py_ssize_t *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(Py_ssize_t)*)obj); +} + +static inline void* +_Py_atomic_load_ptr(const void *obj) +{ + _Py_USING_STD; + return atomic_load((const _Atomic(void*)*)obj); +} + + +// --- _Py_atomic_load_relaxed ----------------------------------------------- + +static inline int +_Py_atomic_load_int_relaxed(const int *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(int)*)obj, + memory_order_relaxed); +} + +static inline int8_t +_Py_atomic_load_int8_relaxed(const int8_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(int8_t)*)obj, + memory_order_relaxed); +} + +static inline int16_t +_Py_atomic_load_int16_relaxed(const int16_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(int16_t)*)obj, + memory_order_relaxed); +} + +static inline int32_t +_Py_atomic_load_int32_relaxed(const int32_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(int32_t)*)obj, + memory_order_relaxed); +} + +static inline int64_t +_Py_atomic_load_int64_relaxed(const int64_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(int64_t)*)obj, + memory_order_relaxed); +} + +static inline intptr_t +_Py_atomic_load_intptr_relaxed(const intptr_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(intptr_t)*)obj, + memory_order_relaxed); +} + +static inline uint8_t +_Py_atomic_load_uint8_relaxed(const uint8_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(uint8_t)*)obj, + memory_order_relaxed); +} + +static inline uint16_t +_Py_atomic_load_uint16_relaxed(const uint16_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(uint16_t)*)obj, + memory_order_relaxed); +} + +static inline uint32_t +_Py_atomic_load_uint32_relaxed(const uint32_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(uint32_t)*)obj, + memory_order_relaxed); +} + +static inline uint64_t +_Py_atomic_load_uint64_relaxed(const uint64_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(uint64_t)*)obj, + memory_order_relaxed); +} + +static inline uintptr_t +_Py_atomic_load_uintptr_relaxed(const uintptr_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(uintptr_t)*)obj, + memory_order_relaxed); +} + +static inline unsigned int +_Py_atomic_load_uint_relaxed(const unsigned int *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(unsigned int)*)obj, + memory_order_relaxed); +} + +static inline Py_ssize_t +_Py_atomic_load_ssize_relaxed(const Py_ssize_t *obj) 
+{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(Py_ssize_t)*)obj, + memory_order_relaxed); +} + +static inline void* +_Py_atomic_load_ptr_relaxed(const void *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(void*)*)obj, + memory_order_relaxed); +} + +static inline unsigned long long +_Py_atomic_load_ullong_relaxed(const unsigned long long *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(unsigned long long)*)obj, + memory_order_relaxed); +} + + +// --- _Py_atomic_store ------------------------------------------------------ + +static inline void +_Py_atomic_store_int(int *obj, int value) +{ + _Py_USING_STD; + atomic_store((_Atomic(int)*)obj, value); +} + +static inline void +_Py_atomic_store_int8(int8_t *obj, int8_t value) +{ + _Py_USING_STD; + atomic_store((_Atomic(int8_t)*)obj, value); +} + +static inline void +_Py_atomic_store_int16(int16_t *obj, int16_t value) +{ + _Py_USING_STD; + atomic_store((_Atomic(int16_t)*)obj, value); +} + +static inline void +_Py_atomic_store_int32(int32_t *obj, int32_t value) +{ + _Py_USING_STD; + atomic_store((_Atomic(int32_t)*)obj, value); +} + +static inline void +_Py_atomic_store_int64(int64_t *obj, int64_t value) +{ + _Py_USING_STD; + atomic_store((_Atomic(int64_t)*)obj, value); +} + +static inline void +_Py_atomic_store_intptr(intptr_t *obj, intptr_t value) +{ + _Py_USING_STD; + atomic_store((_Atomic(intptr_t)*)obj, value); +} + +static inline void +_Py_atomic_store_uint8(uint8_t *obj, uint8_t value) +{ + _Py_USING_STD; + atomic_store((_Atomic(uint8_t)*)obj, value); +} + +static inline void +_Py_atomic_store_uint16(uint16_t *obj, uint16_t value) +{ + _Py_USING_STD; + atomic_store((_Atomic(uint16_t)*)obj, value); +} + +static inline void +_Py_atomic_store_uint32(uint32_t *obj, uint32_t value) +{ + _Py_USING_STD; + atomic_store((_Atomic(uint32_t)*)obj, value); +} + +static inline void +_Py_atomic_store_uint64(uint64_t *obj, uint64_t value) +{ + _Py_USING_STD; + atomic_store((_Atomic(uint64_t)*)obj, value); +} + +static inline void +_Py_atomic_store_uintptr(uintptr_t *obj, uintptr_t value) +{ + _Py_USING_STD; + atomic_store((_Atomic(uintptr_t)*)obj, value); +} + +static inline void +_Py_atomic_store_uint(unsigned int *obj, unsigned int value) +{ + _Py_USING_STD; + atomic_store((_Atomic(unsigned int)*)obj, value); +} + +static inline void +_Py_atomic_store_ptr(void *obj, void *value) +{ + _Py_USING_STD; + atomic_store((_Atomic(void*)*)obj, value); +} + +static inline void +_Py_atomic_store_ssize(Py_ssize_t *obj, Py_ssize_t value) +{ + _Py_USING_STD; + atomic_store((_Atomic(Py_ssize_t)*)obj, value); +} + + +// --- _Py_atomic_store_relaxed ---------------------------------------------- + +static inline void +_Py_atomic_store_int_relaxed(int *obj, int value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(int)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_int8_relaxed(int8_t *obj, int8_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(int8_t)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_int16_relaxed(int16_t *obj, int16_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(int16_t)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_int32_relaxed(int32_t *obj, int32_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(int32_t)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_int64_relaxed(int64_t *obj, int64_t value) +{ + _Py_USING_STD; + 
atomic_store_explicit((_Atomic(int64_t)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_intptr_relaxed(intptr_t *obj, intptr_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(intptr_t)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_uint8_relaxed(uint8_t *obj, uint8_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(uint8_t)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_uint16_relaxed(uint16_t *obj, uint16_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(uint16_t)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_uint32_relaxed(uint32_t *obj, uint32_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(uint32_t)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_uint64_relaxed(uint64_t *obj, uint64_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(uint64_t)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_uintptr_relaxed(uintptr_t *obj, uintptr_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(uintptr_t)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_uint_relaxed(unsigned int *obj, unsigned int value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(unsigned int)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_ptr_relaxed(void *obj, void *value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(void*)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_ssize_relaxed(Py_ssize_t *obj, Py_ssize_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(Py_ssize_t)*)obj, value, + memory_order_relaxed); +} + +static inline void +_Py_atomic_store_ullong_relaxed(unsigned long long *obj, + unsigned long long value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(unsigned long long)*)obj, value, + memory_order_relaxed); +} + + +// --- _Py_atomic_load_ptr_acquire / _Py_atomic_store_ptr_release ------------ + +static inline void * +_Py_atomic_load_ptr_acquire(const void *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(void*)*)obj, + memory_order_acquire); +} + +static inline uintptr_t +_Py_atomic_load_uintptr_acquire(const uintptr_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(uintptr_t)*)obj, + memory_order_acquire); +} + +static inline void +_Py_atomic_store_ptr_release(void *obj, void *value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(void*)*)obj, value, + memory_order_release); +} + +static inline void +_Py_atomic_store_uintptr_release(uintptr_t *obj, uintptr_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(uintptr_t)*)obj, value, + memory_order_release); +} + +static inline void +_Py_atomic_store_int_release(int *obj, int value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(int)*)obj, value, + memory_order_release); +} + +static inline void +_Py_atomic_store_ssize_release(Py_ssize_t *obj, Py_ssize_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(Py_ssize_t)*)obj, value, + memory_order_release); +} + +static inline int +_Py_atomic_load_int_acquire(const int *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(int)*)obj, + memory_order_acquire); +} + +static inline void +_Py_atomic_store_uint32_release(uint32_t *obj, uint32_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(uint32_t)*)obj, value, + 
memory_order_release); +} + +static inline void +_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(uint64_t)*)obj, value, + memory_order_release); +} + +static inline uint64_t +_Py_atomic_load_uint64_acquire(const uint64_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(uint64_t)*)obj, + memory_order_acquire); +} + +static inline uint32_t +_Py_atomic_load_uint32_acquire(const uint32_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(uint32_t)*)obj, + memory_order_acquire); +} + +static inline Py_ssize_t +_Py_atomic_load_ssize_acquire(const Py_ssize_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(Py_ssize_t)*)obj, + memory_order_acquire); +} + + +// --- _Py_atomic_fence ------------------------------------------------------ + + static inline void +_Py_atomic_fence_seq_cst(void) +{ + _Py_USING_STD; + atomic_thread_fence(memory_order_seq_cst); +} + + static inline void +_Py_atomic_fence_acquire(void) +{ + _Py_USING_STD; + atomic_thread_fence(memory_order_acquire); +} + + static inline void +_Py_atomic_fence_release(void) +{ + _Py_USING_STD; + atomic_thread_fence(memory_order_release); +} diff --git a/Include/cpython/pyctype.h b/Include/cpython/pyctype.h new file mode 100644 index 0000000000000000000000000000000000000000..729d93275e6c5365fb26fd521b1ff58de22b5fdb --- /dev/null +++ b/Include/cpython/pyctype.h @@ -0,0 +1,39 @@ +#ifndef Py_LIMITED_API +#ifndef PYCTYPE_H +#define PYCTYPE_H +#ifdef __cplusplus +extern "C" { +#endif + +#define PY_CTF_LOWER 0x01 +#define PY_CTF_UPPER 0x02 +#define PY_CTF_ALPHA (PY_CTF_LOWER|PY_CTF_UPPER) +#define PY_CTF_DIGIT 0x04 +#define PY_CTF_ALNUM (PY_CTF_ALPHA|PY_CTF_DIGIT) +#define PY_CTF_SPACE 0x08 +#define PY_CTF_XDIGIT 0x10 + +PyAPI_DATA(const unsigned int) _Py_ctype_table[256]; + +/* Unlike their C counterparts, the following macros are not meant to + * handle an int with any of the values [EOF, 0-UCHAR_MAX]. The argument + * must be a signed/unsigned char. 
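+ *
+ * For example (illustrative):
+ *
+ *      char c = '7';
+ *      Py_ISDIGIT(c);              // OK: the argument is a char
+ *      int ch = fgetc(fp);
+ *      Py_ISDIGIT(ch);             // not supported: ch may be EOF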
*/ +#define Py_ISLOWER(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_LOWER) +#define Py_ISUPPER(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_UPPER) +#define Py_ISALPHA(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_ALPHA) +#define Py_ISDIGIT(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_DIGIT) +#define Py_ISXDIGIT(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_XDIGIT) +#define Py_ISALNUM(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_ALNUM) +#define Py_ISSPACE(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_SPACE) + +PyAPI_DATA(const unsigned char) _Py_ctype_tolower[256]; +PyAPI_DATA(const unsigned char) _Py_ctype_toupper[256]; + +#define Py_TOLOWER(c) (_Py_ctype_tolower[Py_CHARMASK(c)]) +#define Py_TOUPPER(c) (_Py_ctype_toupper[Py_CHARMASK(c)]) + +#ifdef __cplusplus +} +#endif +#endif /* !PYCTYPE_H */ +#endif /* !Py_LIMITED_API */ diff --git a/Include/cpython/pydebug.h b/Include/cpython/pydebug.h new file mode 100644 index 0000000000000000000000000000000000000000..f6ebd99ed7e2ff2755d7fdd1d11fa77ff374088b --- /dev/null +++ b/Include/cpython/pydebug.h @@ -0,0 +1,38 @@ +#ifndef Py_LIMITED_API +#ifndef Py_PYDEBUG_H +#define Py_PYDEBUG_H +#ifdef __cplusplus +extern "C" { +#endif + +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_DebugFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_VerboseFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_QuietFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_InteractiveFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_InspectFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_OptimizeFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_NoSiteFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_BytesWarningFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_FrozenFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_IgnoreEnvironmentFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_DontWriteBytecodeFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_NoUserSiteDirectory; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_UnbufferedStdioFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_HashRandomizationFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_IsolatedFlag; + +#ifdef MS_WINDOWS +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_LegacyWindowsFSEncodingFlag; +Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_LegacyWindowsStdioFlag; +#endif + +/* this is a wrapper around getenv() that pays attention to + Py_IgnoreEnvironmentFlag. It should be used for getting variables like + PYTHONPATH and PYTHONHOME from the environment */ +PyAPI_FUNC(char*) Py_GETENV(const char *name); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_PYDEBUG_H */ +#endif /* Py_LIMITED_API */ diff --git a/Include/cpython/pyerrors.h b/Include/cpython/pyerrors.h new file mode 100644 index 0000000000000000000000000000000000000000..b36b4681f5dddb58de67f1cb1f2c4b1b3720369a --- /dev/null +++ b/Include/cpython/pyerrors.h @@ -0,0 +1,131 @@ +#ifndef Py_CPYTHON_ERRORS_H +# error "this header file must not be included directly" +#endif + +/* Error objects */ + +/* PyException_HEAD defines the initial segment of every exception class. 
*/ +#define PyException_HEAD PyObject_HEAD PyObject *dict;\ + PyObject *args; PyObject *notes; PyObject *traceback;\ + PyObject *context; PyObject *cause;\ + char suppress_context; + +typedef struct { + PyException_HEAD +} PyBaseExceptionObject; + +typedef struct { + PyException_HEAD + PyObject *msg; + PyObject *excs; +} PyBaseExceptionGroupObject; + +typedef struct { + PyException_HEAD + PyObject *msg; + PyObject *filename; + PyObject *lineno; + PyObject *offset; + PyObject *end_lineno; + PyObject *end_offset; + PyObject *text; + PyObject *print_file_and_line; +} PySyntaxErrorObject; + +typedef struct { + PyException_HEAD + PyObject *msg; + PyObject *name; + PyObject *path; + PyObject *name_from; +} PyImportErrorObject; + +typedef struct { + PyException_HEAD + PyObject *encoding; + PyObject *object; + Py_ssize_t start; + Py_ssize_t end; + PyObject *reason; +} PyUnicodeErrorObject; + +typedef struct { + PyException_HEAD + PyObject *code; +} PySystemExitObject; + +typedef struct { + PyException_HEAD + PyObject *myerrno; + PyObject *strerror; + PyObject *filename; + PyObject *filename2; +#ifdef MS_WINDOWS + PyObject *winerror; +#endif + Py_ssize_t written; /* only for BlockingIOError, -1 otherwise */ +} PyOSErrorObject; + +typedef struct { + PyException_HEAD + PyObject *value; +} PyStopIterationObject; + +typedef struct { + PyException_HEAD + PyObject *name; +} PyNameErrorObject; + +typedef struct { + PyException_HEAD + PyObject *obj; + PyObject *name; +} PyAttributeErrorObject; + +/* Compatibility typedefs */ +typedef PyOSErrorObject PyEnvironmentErrorObject; +#ifdef MS_WINDOWS +typedef PyOSErrorObject PyWindowsErrorObject; +#endif + +/* Context manipulation (PEP 3134) */ + +PyAPI_FUNC(void) _PyErr_ChainExceptions1(PyObject *); + +/* In exceptions.c */ + +PyAPI_FUNC(PyObject*) PyUnstable_Exc_PrepReraiseStar( + PyObject *orig, + PyObject *excs); + +/* In signalmodule.c */ + +PyAPI_FUNC(int) PySignal_SetWakeupFd(int fd); + +/* Support for adding program text to SyntaxErrors */ + +PyAPI_FUNC(void) PyErr_SyntaxLocationObject( + PyObject *filename, + int lineno, + int col_offset); + +PyAPI_FUNC(void) PyErr_RangedSyntaxLocationObject( + PyObject *filename, + int lineno, + int col_offset, + int end_lineno, + int end_col_offset); + +PyAPI_FUNC(PyObject *) PyErr_ProgramTextObject( + PyObject *filename, + int lineno); + +PyAPI_FUNC(void) _Py_NO_RETURN _Py_FatalErrorFunc( + const char *func, + const char *message); + +PyAPI_FUNC(void) PyErr_FormatUnraisable(const char *, ...); + +PyAPI_DATA(PyObject *) PyExc_PythonFinalizationError; + +#define Py_FatalError(message) _Py_FatalErrorFunc(__func__, (message)) diff --git a/Include/cpython/pyfpe.h b/Include/cpython/pyfpe.h new file mode 100644 index 0000000000000000000000000000000000000000..cc2def63aa5527f4ac192aa663a14353a88e1774 --- /dev/null +++ b/Include/cpython/pyfpe.h @@ -0,0 +1,15 @@ +#ifndef Py_PYFPE_H +#define Py_PYFPE_H +/* Header excluded from the stable API */ +#ifndef Py_LIMITED_API + +/* These macros used to do something when Python was built with --with-fpectl, + * but support for that was dropped in 3.7. We continue to define them though, + * to avoid breaking API users. 
+ */ + +#define PyFPE_START_PROTECT(err_string, leave_stmt) +#define PyFPE_END_PROTECT(v) + +#endif /* !defined(Py_LIMITED_API) */ +#endif /* !Py_PYFPE_H */ diff --git a/Include/cpython/pyframe.h b/Include/cpython/pyframe.h new file mode 100644 index 0000000000000000000000000000000000000000..eeafbb17a56badddb542d8d13cbcd799ea1403eb --- /dev/null +++ b/Include/cpython/pyframe.h @@ -0,0 +1,45 @@ +#ifndef Py_CPYTHON_PYFRAME_H +# error "this header file must not be included directly" +#endif + +PyAPI_DATA(PyTypeObject) PyFrame_Type; +PyAPI_DATA(PyTypeObject) PyFrameLocalsProxy_Type; + +#define PyFrame_Check(op) Py_IS_TYPE((op), &PyFrame_Type) +#define PyFrameLocalsProxy_Check(op) Py_IS_TYPE((op), &PyFrameLocalsProxy_Type) + +PyAPI_FUNC(PyFrameObject *) PyFrame_GetBack(PyFrameObject *frame); +PyAPI_FUNC(PyObject *) PyFrame_GetLocals(PyFrameObject *frame); + +PyAPI_FUNC(PyObject *) PyFrame_GetGlobals(PyFrameObject *frame); +PyAPI_FUNC(PyObject *) PyFrame_GetBuiltins(PyFrameObject *frame); + +PyAPI_FUNC(PyObject *) PyFrame_GetGenerator(PyFrameObject *frame); +PyAPI_FUNC(int) PyFrame_GetLasti(PyFrameObject *frame); +PyAPI_FUNC(PyObject*) PyFrame_GetVar(PyFrameObject *frame, PyObject *name); +PyAPI_FUNC(PyObject*) PyFrame_GetVarString(PyFrameObject *frame, const char *name); + +/* The following functions are for use by debuggers and other tools + * implementing custom frame evaluators with PEP 523. */ + +struct _PyInterpreterFrame; + +/* Returns the code object of the frame (strong reference). + * Does not raise an exception. */ +PyAPI_FUNC(PyObject *) PyUnstable_InterpreterFrame_GetCode(struct _PyInterpreterFrame *frame); + +/* Returns a byte offset into the last executed instruction. + * Does not raise an exception. */ +PyAPI_FUNC(int) PyUnstable_InterpreterFrame_GetLasti(struct _PyInterpreterFrame *frame); + +/* Returns the currently executing line number, or -1 if there is no line number. + * Does not raise an exception. */ +PyAPI_FUNC(int) PyUnstable_InterpreterFrame_GetLine(struct _PyInterpreterFrame *frame); + +#define PyUnstable_EXECUTABLE_KIND_SKIP 0 +#define PyUnstable_EXECUTABLE_KIND_PY_FUNCTION 1 +#define PyUnstable_EXECUTABLE_KIND_BUILTIN_FUNCTION 3 +#define PyUnstable_EXECUTABLE_KIND_METHOD_DESCRIPTOR 4 +#define PyUnstable_EXECUTABLE_KINDS 5 + +PyAPI_DATA(const PyTypeObject *) const PyUnstable_ExecutableKinds[PyUnstable_EXECUTABLE_KINDS+1]; diff --git a/Include/cpython/pyhash.h b/Include/cpython/pyhash.h new file mode 100644 index 0000000000000000000000000000000000000000..825c034a8d8474843102de497f4f973125e0674c --- /dev/null +++ b/Include/cpython/pyhash.h @@ -0,0 +1,47 @@ +#ifndef Py_CPYTHON_HASH_H +# error "this header file must not be included directly" +#endif + +/* Prime multiplier used in string and various other hashes. */ +#define PyHASH_MULTIPLIER 1000003UL /* 0xf4243 */ + +/* Parameters used for the numeric hash implementation. See notes for + _Py_HashDouble in Python/pyhash.c. Numeric hashes are based on + reduction modulo the prime 2**_PyHASH_BITS - 1. 
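+
+   A worked example (illustrative): with PyHASH_BITS == 61 on 64-bit builds
+   the modulus is 2**61 - 1, so in Python hash(2**61) == hash(1) == 1.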
*/ + +#if SIZEOF_VOID_P >= 8 +# define PyHASH_BITS 61 +#else +# define PyHASH_BITS 31 +#endif + +#define PyHASH_MODULUS (((size_t)1 << _PyHASH_BITS) - 1) +#define PyHASH_INF 314159 +#define PyHASH_IMAG PyHASH_MULTIPLIER + +/* Aliases kept for backward compatibility with Python 3.12 */ +#define _PyHASH_MULTIPLIER PyHASH_MULTIPLIER +#define _PyHASH_BITS PyHASH_BITS +#define _PyHASH_MODULUS PyHASH_MODULUS +#define _PyHASH_INF PyHASH_INF +#define _PyHASH_IMAG PyHASH_IMAG + +/* Helpers for hash functions */ +PyAPI_FUNC(Py_hash_t) _Py_HashDouble(PyObject *, double); + +// Kept for backward compatibility +#define _Py_HashPointer Py_HashPointer + + +/* hash function definition */ +typedef struct { + Py_hash_t (*const hash)(const void *, Py_ssize_t); + const char *name; + const int hash_bits; + const int seed_bits; +} PyHash_FuncDef; + +PyAPI_FUNC(PyHash_FuncDef*) PyHash_GetFuncDef(void); + +PyAPI_FUNC(Py_hash_t) Py_HashPointer(const void *ptr); +PyAPI_FUNC(Py_hash_t) PyObject_GenericHash(PyObject *); diff --git a/Include/cpython/pylifecycle.h b/Include/cpython/pylifecycle.h new file mode 100644 index 0000000000000000000000000000000000000000..e46dfe59ec463044b06d3065dd3e4bfb26821e38 --- /dev/null +++ b/Include/cpython/pylifecycle.h @@ -0,0 +1,92 @@ +#ifndef Py_CPYTHON_PYLIFECYCLE_H +# error "this header file must not be included directly" +#endif + +/* Py_FrozenMain is kept out of the Limited API until documented and present + in all builds of Python */ +PyAPI_FUNC(int) Py_FrozenMain(int argc, char **argv); + +/* PEP 432 Multi-phase initialization API (Private while provisional!) */ + +PyAPI_FUNC(PyStatus) Py_PreInitialize( + const PyPreConfig *src_config); +PyAPI_FUNC(PyStatus) Py_PreInitializeFromBytesArgs( + const PyPreConfig *src_config, + Py_ssize_t argc, + char **argv); +PyAPI_FUNC(PyStatus) Py_PreInitializeFromArgs( + const PyPreConfig *src_config, + Py_ssize_t argc, + wchar_t **argv); + + +/* Initialization and finalization */ + +PyAPI_FUNC(PyStatus) Py_InitializeFromConfig( + const PyConfig *config); + +// Python 3.8 provisional API (PEP 587) +PyAPI_FUNC(PyStatus) _Py_InitializeMain(void); + +PyAPI_FUNC(int) Py_RunMain(void); + + +PyAPI_FUNC(void) _Py_NO_RETURN Py_ExitStatusException(PyStatus err); + +PyAPI_FUNC(int) Py_FdIsInteractive(FILE *, const char *); + +/* --- PyInterpreterConfig ------------------------------------ */ + +#define PyInterpreterConfig_DEFAULT_GIL (0) +#define PyInterpreterConfig_SHARED_GIL (1) +#define PyInterpreterConfig_OWN_GIL (2) + +typedef struct { + // XXX "allow_object_sharing"? "own_objects"? + int use_main_obmalloc; + int allow_fork; + int allow_exec; + int allow_threads; + int allow_daemon_threads; + int check_multi_interp_extensions; + int gil; +} PyInterpreterConfig; + +#define _PyInterpreterConfig_INIT \ + { \ + .use_main_obmalloc = 0, \ + .allow_fork = 0, \ + .allow_exec = 0, \ + .allow_threads = 1, \ + .allow_daemon_threads = 0, \ + .check_multi_interp_extensions = 1, \ + .gil = PyInterpreterConfig_OWN_GIL, \ + } + +// gh-117649: The free-threaded build does not currently support single-phase +// init extensions in subinterpreters. For now, we ensure that +// `check_multi_interp_extensions` is always `1`, even in the legacy config. 
+#ifdef Py_GIL_DISABLED +# define _PyInterpreterConfig_LEGACY_CHECK_MULTI_INTERP_EXTENSIONS 1 +#else +# define _PyInterpreterConfig_LEGACY_CHECK_MULTI_INTERP_EXTENSIONS 0 +#endif + +#define _PyInterpreterConfig_LEGACY_INIT \ + { \ + .use_main_obmalloc = 1, \ + .allow_fork = 1, \ + .allow_exec = 1, \ + .allow_threads = 1, \ + .allow_daemon_threads = 1, \ + .check_multi_interp_extensions = _PyInterpreterConfig_LEGACY_CHECK_MULTI_INTERP_EXTENSIONS, \ + .gil = PyInterpreterConfig_SHARED_GIL, \ + } + +PyAPI_FUNC(PyStatus) Py_NewInterpreterFromConfig( + PyThreadState **tstate_p, + const PyInterpreterConfig *config); + +typedef void (*atexit_datacallbackfunc)(void *); +PyAPI_FUNC(int) PyUnstable_AtExit( + PyInterpreterState *, atexit_datacallbackfunc, void *); diff --git a/Include/cpython/pymem.h b/Include/cpython/pymem.h new file mode 100644 index 0000000000000000000000000000000000000000..76b3221f7b9f39f50fa60ef6b5777eed5e00dc04 --- /dev/null +++ b/Include/cpython/pymem.h @@ -0,0 +1,84 @@ +#ifndef Py_CPYTHON_PYMEM_H +# error "this header file must not be included directly" +#endif + +typedef enum { + /* PyMem_RawMalloc(), PyMem_RawRealloc() and PyMem_RawFree() */ + PYMEM_DOMAIN_RAW, + + /* PyMem_Malloc(), PyMem_Realloc() and PyMem_Free() */ + PYMEM_DOMAIN_MEM, + + /* PyObject_Malloc(), PyObject_Realloc() and PyObject_Free() */ + PYMEM_DOMAIN_OBJ +} PyMemAllocatorDomain; + +typedef enum { + PYMEM_ALLOCATOR_NOT_SET = 0, + PYMEM_ALLOCATOR_DEFAULT = 1, + PYMEM_ALLOCATOR_DEBUG = 2, + PYMEM_ALLOCATOR_MALLOC = 3, + PYMEM_ALLOCATOR_MALLOC_DEBUG = 4, +#ifdef WITH_PYMALLOC + PYMEM_ALLOCATOR_PYMALLOC = 5, + PYMEM_ALLOCATOR_PYMALLOC_DEBUG = 6, +#endif +#ifdef WITH_MIMALLOC + PYMEM_ALLOCATOR_MIMALLOC = 7, + PYMEM_ALLOCATOR_MIMALLOC_DEBUG = 8, +#endif +} PyMemAllocatorName; + + +typedef struct { + /* user context passed as the first argument to the 4 functions */ + void *ctx; + + /* allocate a memory block */ + void* (*malloc) (void *ctx, size_t size); + + /* allocate a memory block initialized by zeros */ + void* (*calloc) (void *ctx, size_t nelem, size_t elsize); + + /* allocate or resize a memory block */ + void* (*realloc) (void *ctx, void *ptr, size_t new_size); + + /* release a memory block */ + void (*free) (void *ctx, void *ptr); +} PyMemAllocatorEx; + +/* Get the memory block allocator of the specified domain. */ +PyAPI_FUNC(void) PyMem_GetAllocator(PyMemAllocatorDomain domain, + PyMemAllocatorEx *allocator); + +/* Set the memory block allocator of the specified domain. + + The new allocator must return a distinct non-NULL pointer when requesting + zero bytes. + + For the PYMEM_DOMAIN_RAW domain, the allocator must be thread-safe: the GIL + is not held when the allocator is called. + + If the new allocator is not a hook (don't call the previous allocator), the + PyMem_SetupDebugHooks() function must be called to reinstall the debug hooks + on top of the new allocator. */ +PyAPI_FUNC(void) PyMem_SetAllocator(PyMemAllocatorDomain domain, + PyMemAllocatorEx *allocator); + +/* Setup hooks to detect bugs in the following Python memory allocator + functions: + + - PyMem_RawMalloc(), PyMem_RawRealloc(), PyMem_RawFree() + - PyMem_Malloc(), PyMem_Realloc(), PyMem_Free() + - PyObject_Malloc(), PyObject_Realloc() and PyObject_Free() + + Newly allocated memory is filled with the byte 0xCB, freed memory is filled + with the byte 0xDB. 
Additional checks: + + - detect API violations, ex: PyObject_Free() called on a buffer allocated + by PyMem_Malloc() + - detect write before the start of the buffer (buffer underflow) + - detect write after the end of the buffer (buffer overflow) + + The function does nothing if Python is not compiled in debug mode. */ +PyAPI_FUNC(void) PyMem_SetupDebugHooks(void); diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h new file mode 100644 index 0000000000000000000000000000000000000000..f005729fff11b6841b6754e23c15770a87ad8609 --- /dev/null +++ b/Include/cpython/pystate.h @@ -0,0 +1,277 @@ +#ifndef Py_CPYTHON_PYSTATE_H +# error "this header file must not be included directly" +#endif + + +/* private interpreter helpers */ + +PyAPI_FUNC(int) _PyInterpreterState_RequiresIDRef(PyInterpreterState *); +PyAPI_FUNC(void) _PyInterpreterState_RequireIDRef(PyInterpreterState *, int); + +PyAPI_FUNC(PyObject *) PyUnstable_InterpreterState_GetMainModule(PyInterpreterState *); + +/* State unique per thread */ + +/* Py_tracefunc returns -1 when raising an exception, or 0 for success. */ +typedef int (*Py_tracefunc)(PyObject *, PyFrameObject *, int, PyObject *); + +/* The following values are used for 'what' for tracefunc functions + * + * To add a new kind of trace event, also update "trace_init" in + * Python/sysmodule.c to define the Python level event name + */ +#define PyTrace_CALL 0 +#define PyTrace_EXCEPTION 1 +#define PyTrace_LINE 2 +#define PyTrace_RETURN 3 +#define PyTrace_C_CALL 4 +#define PyTrace_C_EXCEPTION 5 +#define PyTrace_C_RETURN 6 +#define PyTrace_OPCODE 7 + +typedef struct _err_stackitem { + /* This struct represents a single execution context where we might + * be currently handling an exception. It is a per-coroutine state + * (coroutine in the computer science sense, including the thread + * and generators). + * + * This is used as an entry on the exception stack, where each + * entry indicates if it is currently handling an exception. + * This ensures that the exception state is not impacted + * by "yields" from an except handler. The thread + * always has an entry (the bottom-most one). + */ + + /* The exception currently being handled in this context, if any. */ + PyObject *exc_value; + + struct _err_stackitem *previous_item; + +} _PyErr_StackItem; + +typedef struct _stack_chunk { + struct _stack_chunk *previous; + size_t size; + size_t top; + PyObject * data[1]; /* Variable sized */ +} _PyStackChunk; + +struct _ts { + /* See Python/ceval.c for comments explaining most fields */ + + PyThreadState *prev; + PyThreadState *next; + PyInterpreterState *interp; + + /* The global instrumentation version in high bits, plus flags indicating + when to break out of the interpreter loop in lower bits. See details in + pycore_ceval.h. */ + uintptr_t eval_breaker; + + struct { + /* Has been initialized to a safe state. + + In order to be effective, this must be set to 0 during or right + after allocation. */ + unsigned int initialized:1; + + /* Has been bound to an OS thread. */ + unsigned int bound:1; + /* Has been unbound from its OS thread. */ + unsigned int unbound:1; + /* Has been bound as current for the GILState API. */ + unsigned int bound_gilstate:1; + /* Currently in use (maybe holds the GIL). */ + unsigned int active:1; + /* Currently holds the GIL. 
*/ + unsigned int holds_gil:1; + + /* various stages of finalization */ + unsigned int finalizing:1; + unsigned int cleared:1; + unsigned int finalized:1; + + /* padding to align to 4 bytes */ + unsigned int :23; + } _status; +#ifdef Py_BUILD_CORE +# define _PyThreadState_WHENCE_NOTSET -1 +# define _PyThreadState_WHENCE_UNKNOWN 0 +# define _PyThreadState_WHENCE_INIT 1 +# define _PyThreadState_WHENCE_FINI 2 +# define _PyThreadState_WHENCE_THREADING 3 +# define _PyThreadState_WHENCE_GILSTATE 4 +# define _PyThreadState_WHENCE_EXEC 5 +#endif + int _whence; + + /* Thread state (_Py_THREAD_ATTACHED, _Py_THREAD_DETACHED, _Py_THREAD_SUSPENDED). + See Include/internal/pycore_pystate.h for more details. */ + int state; + + int py_recursion_remaining; + int py_recursion_limit; + + int c_recursion_remaining; + int recursion_headroom; /* Allow 50 more calls to handle any errors. */ + + /* 'tracing' keeps track of the execution depth when tracing/profiling. + This is to prevent the actual trace/profile code from being recorded in + the trace/profile. */ + int tracing; + int what_event; /* The event currently being monitored, if any. */ + + /* Pointer to currently executing frame. */ + struct _PyInterpreterFrame *current_frame; + + Py_tracefunc c_profilefunc; + Py_tracefunc c_tracefunc; + PyObject *c_profileobj; + PyObject *c_traceobj; + + /* The exception currently being raised */ + PyObject *current_exception; + + /* Pointer to the top of the exception stack for the exceptions + * we may be currently handling. (See _PyErr_StackItem above.) + * This is never NULL. */ + _PyErr_StackItem *exc_info; + + PyObject *dict; /* Stores per-thread state */ + + int gilstate_counter; + + PyObject *async_exc; /* Asynchronous exception to raise */ + unsigned long thread_id; /* Thread id where this tstate was created */ + + /* Native thread id where this tstate was created. This will be 0 except on + * those platforms that have the notion of native thread id, for which the + * macro PY_HAVE_THREAD_NATIVE_ID is then defined. + */ + unsigned long native_thread_id; + + PyObject *delete_later; + + /* Tagged pointer to top-most critical section, or zero if there is no + * active critical section. Critical sections are only used in + * `--disable-gil` builds (i.e., when Py_GIL_DISABLED is defined to 1). In the + * default build, this field is always zero. + */ + uintptr_t critical_section; + + int coroutine_origin_tracking_depth; + + PyObject *async_gen_firstiter; + PyObject *async_gen_finalizer; + + PyObject *context; + uint64_t context_ver; + + /* Unique thread state id. */ + uint64_t id; + + _PyStackChunk *datastack_chunk; + PyObject **datastack_top; + PyObject **datastack_limit; + /* XXX signal handlers should also be here */ + + /* The following fields are here to avoid allocation during init. + The data is exposed through PyThreadState pointer fields. + These fields should not be accessed directly outside of init. + This is indicated by an underscore prefix on the field names. + + All other PyInterpreterState pointer fields are populated when + needed and default to NULL. + */ + // Note some fields do not have a leading underscore for backward + // compatibility. See https://bugs.python.org/issue45953#msg412046. + + /* The thread's exception stack entry. (Always the last entry.) 
*/ + _PyErr_StackItem exc_state; + + PyObject *previous_executor; + + uint64_t dict_global_version; + + /* Used to store/retrieve `threading.local` keys/values for this thread */ + PyObject *threading_local_key; + + /* Used by `threading.local`s to remove keys/values for dying threads. + The PyThreadObject must hold the only reference to this value. + */ + PyObject *threading_local_sentinel; +}; + +#ifdef Py_DEBUG + // A debug build is likely built with low optimization level which implies + // higher stack memory usage than a release build: use a lower limit. +# define Py_C_RECURSION_LIMIT 500 +#elif defined(__s390x__) +# define Py_C_RECURSION_LIMIT 800 +#elif defined(_WIN32) && defined(_M_ARM64) +# define Py_C_RECURSION_LIMIT 1000 +#elif defined(_WIN32) +# define Py_C_RECURSION_LIMIT 3000 +#elif defined(__ANDROID__) + // On an ARM64 emulator, API level 34 was OK with 10000, but API level 21 + // crashed in test_compiler_recursion_limit. +# define Py_C_RECURSION_LIMIT 3000 +#elif defined(_Py_ADDRESS_SANITIZER) +# define Py_C_RECURSION_LIMIT 4000 +#elif defined(__wasi__) + // Based on wasmtime 16. +# define Py_C_RECURSION_LIMIT 5000 +#else + // This value is duplicated in Lib/test/support/__init__.py +# define Py_C_RECURSION_LIMIT 10000 +#endif + + +/* other API */ + +/* Similar to PyThreadState_Get(), but don't issue a fatal error + * if it is NULL. */ +PyAPI_FUNC(PyThreadState *) PyThreadState_GetUnchecked(void); + +// Alias kept for backward compatibility +#define _PyThreadState_UncheckedGet PyThreadState_GetUnchecked + + +// Disable tracing and profiling. +PyAPI_FUNC(void) PyThreadState_EnterTracing(PyThreadState *tstate); + +// Reset tracing and profiling: enable them if a trace function or a profile +// function is set, otherwise disable them. +PyAPI_FUNC(void) PyThreadState_LeaveTracing(PyThreadState *tstate); + +/* PyGILState */ + +/* Helper/diagnostic function - return 1 if the current thread + currently holds the GIL, 0 otherwise. + + The function returns 1 if _PyGILState_check_enabled is non-zero. */ +PyAPI_FUNC(int) PyGILState_Check(void); + +/* The implementation of sys._current_frames(). Returns a dict mapping + thread id to that thread's current frame. +*/ +PyAPI_FUNC(PyObject*) _PyThread_CurrentFrames(void); + +/* Routines for advanced debuggers, requested by David Beazley. + Don't use unless you know what you are doing! */ +PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_Main(void); +PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_Head(void); +PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_Next(PyInterpreterState *); +PyAPI_FUNC(PyThreadState *) PyInterpreterState_ThreadHead(PyInterpreterState *); +PyAPI_FUNC(PyThreadState *) PyThreadState_Next(PyThreadState *); +PyAPI_FUNC(void) PyThreadState_DeleteCurrent(void); + +/* Frame evaluation API */ + +typedef PyObject* (*_PyFrameEvalFunction)(PyThreadState *tstate, struct _PyInterpreterFrame *, int); + +PyAPI_FUNC(_PyFrameEvalFunction) _PyInterpreterState_GetEvalFrameFunc( + PyInterpreterState *interp); +PyAPI_FUNC(void) _PyInterpreterState_SetEvalFrameFunc( + PyInterpreterState *interp, + _PyFrameEvalFunction eval_frame); diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h new file mode 100644 index 0000000000000000000000000000000000000000..378c2760ec3f5577909f2e489b9adaaad1a63390 --- /dev/null +++ b/Include/cpython/pystats.h @@ -0,0 +1,175 @@ +// Statistics on Python performance. +// +// API: +// +// - _Py_INCREF_STAT_INC() and _Py_DECREF_STAT_INC() used by Py_INCREF() +// and Py_DECREF(). 
+// - _Py_stats variable +// +// Functions of the sys module: +// +// - sys._stats_on() +// - sys._stats_off() +// - sys._stats_clear() +// - sys._stats_dump() +// +// Python must be built with ./configure --enable-pystats to define the +// Py_STATS macro. +// +// Define _PY_INTERPRETER macro to increment interpreter_increfs and +// interpreter_decrefs. Otherwise, increment increfs and decrefs. +// +// The number of incref operations counted by `incref` and +// `interpreter_incref` is the number of increment operations, which is +// not equal to the total of all reference counts. A single increment +// operation may increase the reference count of an object by more than +// one. For example, see `_Py_RefcntAdd`. + +#ifndef Py_CPYTHON_PYSTATS_H +# error "this header file must not be included directly" +#endif + +#define PYSTATS_MAX_UOP_ID 512 + +#define SPECIALIZATION_FAILURE_KINDS 36 + +/* Stats for determining who is calling PyEval_EvalFrame */ +#define EVAL_CALL_TOTAL 0 +#define EVAL_CALL_VECTOR 1 +#define EVAL_CALL_GENERATOR 2 +#define EVAL_CALL_LEGACY 3 +#define EVAL_CALL_FUNCTION_VECTORCALL 4 +#define EVAL_CALL_BUILD_CLASS 5 +#define EVAL_CALL_SLOT 6 +#define EVAL_CALL_FUNCTION_EX 7 +#define EVAL_CALL_API 8 +#define EVAL_CALL_METHOD 9 + +#define EVAL_CALL_KINDS 10 + +typedef struct _specialization_stats { + uint64_t success; + uint64_t failure; + uint64_t hit; + uint64_t deferred; + uint64_t miss; + uint64_t deopt; + uint64_t failure_kinds[SPECIALIZATION_FAILURE_KINDS]; +} SpecializationStats; + +typedef struct _opcode_stats { + SpecializationStats specialization; + uint64_t execution_count; + uint64_t pair_count[256]; +} OpcodeStats; + +typedef struct _call_stats { + uint64_t inlined_py_calls; + uint64_t pyeval_calls; + uint64_t frames_pushed; + uint64_t frame_objects_created; + uint64_t eval_calls[EVAL_CALL_KINDS]; +} CallStats; + +typedef struct _object_stats { + uint64_t increfs; + uint64_t decrefs; + uint64_t interpreter_increfs; + uint64_t interpreter_decrefs; + uint64_t allocations; + uint64_t allocations512; + uint64_t allocations4k; + uint64_t allocations_big; + uint64_t frees; + uint64_t to_freelist; + uint64_t from_freelist; + uint64_t inline_values; + uint64_t dict_materialized_on_request; + uint64_t dict_materialized_new_key; + uint64_t dict_materialized_too_big; + uint64_t dict_materialized_str_subclass; + uint64_t type_cache_hits; + uint64_t type_cache_misses; + uint64_t type_cache_dunder_hits; + uint64_t type_cache_dunder_misses; + uint64_t type_cache_collisions; + /* Temporary value used during GC */ + uint64_t object_visits; +} ObjectStats; + +typedef struct _gc_stats { + uint64_t collections; + uint64_t object_visits; + uint64_t objects_collected; +} GCStats; + +typedef struct _uop_stats { + uint64_t execution_count; + uint64_t miss; + uint64_t pair_count[PYSTATS_MAX_UOP_ID + 1]; +} UOpStats; + +#define _Py_UOP_HIST_SIZE 32 + +typedef struct _optimization_stats { + uint64_t attempts; + uint64_t traces_created; + uint64_t traces_executed; + uint64_t uops_executed; + uint64_t trace_stack_overflow; + uint64_t trace_stack_underflow; + uint64_t trace_too_long; + uint64_t trace_too_short; + uint64_t inner_loop; + uint64_t recursive_call; + uint64_t low_confidence; + uint64_t executors_invalidated; + UOpStats opcode[PYSTATS_MAX_UOP_ID + 1]; + uint64_t unsupported_opcode[256]; + uint64_t trace_length_hist[_Py_UOP_HIST_SIZE]; + uint64_t trace_run_length_hist[_Py_UOP_HIST_SIZE]; + uint64_t optimized_trace_length_hist[_Py_UOP_HIST_SIZE]; + uint64_t optimizer_attempts; + 
uint64_t optimizer_successes; + uint64_t optimizer_failure_reason_no_memory; + uint64_t remove_globals_builtins_changed; + uint64_t remove_globals_incorrect_keys; + uint64_t error_in_opcode[PYSTATS_MAX_UOP_ID + 1]; +} OptimizationStats; + +typedef struct _rare_event_stats { + /* Setting an object's class, obj.__class__ = ... */ + uint64_t set_class; + /* Setting the bases of a class, cls.__bases__ = ... */ + uint64_t set_bases; + /* Setting the PEP 523 frame eval function, _PyInterpreterState_SetFrameEvalFunc() */ + uint64_t set_eval_frame_func; + /* Modifying the builtins, __builtins__.__dict__[var] = ... */ + uint64_t builtin_dict; + /* Modifying a function, e.g. func.__defaults__ = ..., etc. */ + uint64_t func_modification; + /* Modifying a dict that is being watched */ + uint64_t watched_dict_modification; + uint64_t watched_globals_modification; +} RareEventStats; + +typedef struct _stats { + OpcodeStats opcode_stats[256]; + CallStats call_stats; + ObjectStats object_stats; + OptimizationStats optimization_stats; + RareEventStats rare_event_stats; + GCStats *gc_stats; +} PyStats; + + +// Export for shared extensions like 'math' +PyAPI_DATA(PyStats*) _Py_stats; + +#ifdef _PY_INTERPRETER +# define _Py_INCREF_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.interpreter_increfs++; } while (0) +# define _Py_DECREF_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.interpreter_decrefs++; } while (0) +#else +# define _Py_INCREF_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.increfs++; } while (0) +# define _Py_DECREF_STAT_INC() do { if (_Py_stats) _Py_stats->object_stats.decrefs++; } while (0) +#endif diff --git a/Include/cpython/pythonrun.h b/Include/cpython/pythonrun.h new file mode 100644 index 0000000000000000000000000000000000000000..edc40952254029f8c8f2927f4e3324a14d40fba6 --- /dev/null +++ b/Include/cpython/pythonrun.h @@ -0,0 +1,96 @@ +#ifndef Py_CPYTHON_PYTHONRUN_H +# error "this header file must not be included directly" +#endif + +PyAPI_FUNC(int) PyRun_SimpleStringFlags(const char *, PyCompilerFlags *); +PyAPI_FUNC(int) PyRun_AnyFileExFlags( + FILE *fp, + const char *filename, /* decoded from the filesystem encoding */ + int closeit, + PyCompilerFlags *flags); +PyAPI_FUNC(int) PyRun_SimpleFileExFlags( + FILE *fp, + const char *filename, /* decoded from the filesystem encoding */ + int closeit, + PyCompilerFlags *flags); +PyAPI_FUNC(int) PyRun_InteractiveOneFlags( + FILE *fp, + const char *filename, /* decoded from the filesystem encoding */ + PyCompilerFlags *flags); +PyAPI_FUNC(int) PyRun_InteractiveOneObject( + FILE *fp, + PyObject *filename, + PyCompilerFlags *flags); +PyAPI_FUNC(int) PyRun_InteractiveLoopFlags( + FILE *fp, + const char *filename, /* decoded from the filesystem encoding */ + PyCompilerFlags *flags); + + +PyAPI_FUNC(PyObject *) PyRun_StringFlags(const char *, int, PyObject *, + PyObject *, PyCompilerFlags *); + +PyAPI_FUNC(PyObject *) PyRun_FileExFlags( + FILE *fp, + const char *filename, /* decoded from the filesystem encoding */ + int start, + PyObject *globals, + PyObject *locals, + int closeit, + PyCompilerFlags *flags); + + +PyAPI_FUNC(PyObject *) Py_CompileStringExFlags( + const char *str, + const char *filename, /* decoded from the filesystem encoding */ + int start, + PyCompilerFlags *flags, + int optimize); +PyAPI_FUNC(PyObject *) Py_CompileStringObject( + const char *str, + PyObject *filename, int start, + PyCompilerFlags *flags, + int optimize); + +#define Py_CompileString(str, p, s) Py_CompileStringExFlags((str), (p), (s), NULL, -1) 
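+
+/* For example (a sketch; `globals` and `locals` stand for caller-provided
+   dictionaries), a snippet can be compiled once and evaluated later:
+
+       PyObject *code = Py_CompileString("x = 1 + 1", "<embed>", Py_file_input);
+       PyObject *result = code ? PyEval_EvalCode(code, globals, locals) : NULL;
+*/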
+#define Py_CompileStringFlags(str, p, s, f) Py_CompileStringExFlags((str), (p), (s), (f), -1) + +/* A function flavor is also exported by libpython. It is required when + libpython is accessed directly rather than through the header files, which + define the macros below. On Windows, for example, PyAPI_FUNC() uses dllexport to + export functions in pythonXX.dll. */ +PyAPI_FUNC(PyObject *) PyRun_String(const char *str, int s, PyObject *g, PyObject *l); +PyAPI_FUNC(int) PyRun_AnyFile(FILE *fp, const char *name); +PyAPI_FUNC(int) PyRun_AnyFileEx(FILE *fp, const char *name, int closeit); +PyAPI_FUNC(int) PyRun_AnyFileFlags(FILE *, const char *, PyCompilerFlags *); +PyAPI_FUNC(int) PyRun_SimpleString(const char *s); +PyAPI_FUNC(int) PyRun_SimpleFile(FILE *f, const char *p); +PyAPI_FUNC(int) PyRun_SimpleFileEx(FILE *f, const char *p, int c); +PyAPI_FUNC(int) PyRun_InteractiveOne(FILE *f, const char *p); +PyAPI_FUNC(int) PyRun_InteractiveLoop(FILE *f, const char *p); +PyAPI_FUNC(PyObject *) PyRun_File(FILE *fp, const char *p, int s, PyObject *g, PyObject *l); +PyAPI_FUNC(PyObject *) PyRun_FileEx(FILE *fp, const char *p, int s, PyObject *g, PyObject *l, int c); +PyAPI_FUNC(PyObject *) PyRun_FileFlags(FILE *fp, const char *p, int s, PyObject *g, PyObject *l, PyCompilerFlags *flags); + +/* Use macros for a bunch of old variants */ +#define PyRun_String(str, s, g, l) PyRun_StringFlags((str), (s), (g), (l), NULL) +#define PyRun_AnyFile(fp, name) PyRun_AnyFileExFlags((fp), (name), 0, NULL) +#define PyRun_AnyFileEx(fp, name, closeit) \ PyRun_AnyFileExFlags((fp), (name), (closeit), NULL) +#define PyRun_AnyFileFlags(fp, name, flags) \ PyRun_AnyFileExFlags((fp), (name), 0, (flags)) +#define PyRun_SimpleString(s) PyRun_SimpleStringFlags((s), NULL) +#define PyRun_SimpleFile(f, p) PyRun_SimpleFileExFlags((f), (p), 0, NULL) +#define PyRun_SimpleFileEx(f, p, c) PyRun_SimpleFileExFlags((f), (p), (c), NULL) +#define PyRun_InteractiveOne(f, p) PyRun_InteractiveOneFlags((f), (p), NULL) +#define PyRun_InteractiveLoop(f, p) PyRun_InteractiveLoopFlags((f), (p), NULL) +#define PyRun_File(fp, p, s, g, l) \ PyRun_FileExFlags((fp), (p), (s), (g), (l), 0, NULL) +#define PyRun_FileEx(fp, p, s, g, l, c) \ PyRun_FileExFlags((fp), (p), (s), (g), (l), (c), NULL) +#define PyRun_FileFlags(fp, p, s, g, l, flags) \ PyRun_FileExFlags((fp), (p), (s), (g), (l), 0, (flags)) + +/* Stuff with no proper home (yet) */ +PyAPI_FUNC(char *) PyOS_Readline(FILE *, FILE *, const char *); +PyAPI_DATA(char) *(*PyOS_ReadlineFunctionPointer)(FILE *, FILE *, const char *); diff --git a/Include/cpython/pythread.h b/Include/cpython/pythread.h new file mode 100644 index 0000000000000000000000000000000000000000..e658b35bd90700eb1dabad03497c2b52452db9ff --- /dev/null +++ b/Include/cpython/pythread.h @@ -0,0 +1,43 @@ +#ifndef Py_CPYTHON_PYTHREAD_H +# error "this header file must not be included directly" +#endif + +// PY_TIMEOUT_MAX is the highest usable value (in microseconds) of PY_TIMEOUT_T +// type, and depends on the system threading API. +// +// NOTE: this isn't the same value as `_thread.TIMEOUT_MAX`. The _thread module +// exposes a higher-level API, with timeouts expressed in seconds and +// floating-point numbers allowed. +PyAPI_DATA(const long long) PY_TIMEOUT_MAX; + +#define PYTHREAD_INVALID_THREAD_ID ((unsigned long)-1) + +#ifdef HAVE_PTHREAD_H + /* Darwin needs pthread.h to know the type name pthread_key_t.
*/ +# include <pthread.h> +# define NATIVE_TSS_KEY_T pthread_key_t +#elif defined(NT_THREADS) + /* On Windows, the native TSS key type is DWORD, + but we hardcode unsigned long here to avoid depending on an extra include. + */ +# define NATIVE_TSS_KEY_T unsigned long +#elif defined(HAVE_PTHREAD_STUBS) +# include "pthread_stubs.h" +# define NATIVE_TSS_KEY_T pthread_key_t +#else +# error "Require native threads. See https://bugs.python.org/issue31370" +#endif + +/* When Py_LIMITED_API is not defined, the type layout of Py_tss_t is + exposed to allow static allocation by API clients. Even in this case, + TSS keys must be handled through the API functions to preserve compatibility. +*/ +struct _Py_tss_t { + int _is_initialized; + NATIVE_TSS_KEY_T _key; +}; + +#undef NATIVE_TSS_KEY_T + +/* When statically allocated, a Py_tss_t must be initialized with Py_tss_NEEDS_INIT. */ +#define Py_tss_NEEDS_INIT {0} diff --git a/Include/cpython/pytime.h b/Include/cpython/pytime.h new file mode 100644 index 0000000000000000000000000000000000000000..5c68110aeedb867ba90c8cab047fed1489e083ff --- /dev/null +++ b/Include/cpython/pytime.h @@ -0,0 +1,27 @@ +// PyTime_t C API: see Doc/c-api/time.rst for the documentation. + +#ifndef Py_LIMITED_API +#ifndef Py_PYTIME_H +#define Py_PYTIME_H +#ifdef __cplusplus +extern "C" { +#endif + +typedef int64_t PyTime_t; +#define PyTime_MIN INT64_MIN +#define PyTime_MAX INT64_MAX + +PyAPI_FUNC(double) PyTime_AsSecondsDouble(PyTime_t t); +PyAPI_FUNC(int) PyTime_Monotonic(PyTime_t *result); +PyAPI_FUNC(int) PyTime_PerfCounter(PyTime_t *result); +PyAPI_FUNC(int) PyTime_Time(PyTime_t *result); + +PyAPI_FUNC(int) PyTime_MonotonicRaw(PyTime_t *result); +PyAPI_FUNC(int) PyTime_PerfCounterRaw(PyTime_t *result); +PyAPI_FUNC(int) PyTime_TimeRaw(PyTime_t *result); + +#ifdef __cplusplus +} +#endif +#endif /* Py_PYTIME_H */ +#endif /* Py_LIMITED_API */ diff --git a/Include/cpython/setobject.h b/Include/cpython/setobject.h new file mode 100644 index 0000000000000000000000000000000000000000..89565cb29212fc172d82e2b5cf52295dcd3a6184 --- /dev/null +++ b/Include/cpython/setobject.h @@ -0,0 +1,71 @@ +#ifndef Py_CPYTHON_SETOBJECT_H +# error "this header file must not be included directly" +#endif + +/* There are three kinds of entries in the table: + +1. Unused: key == NULL and hash == 0 +2. Dummy: key == dummy and hash == -1 +3. Active: key != NULL and key != dummy and hash != -1 + +The hash field of Unused slots is always zero. + +The hash field of Dummy slots is set to -1, +meaning that dummy entries can be detected by +either entry->key==dummy or by entry->hash==-1. +*/ + +#define PySet_MINSIZE 8 + +typedef struct { + PyObject *key; + Py_hash_t hash; /* Cached hash code of the key */ +} setentry; + +/* The SetObject data structure is shared by set and frozenset objects. + +Invariant for sets: + - hash is -1 + +Invariants for frozensets: + - data is immutable. + - hash is the hash of the frozenset or -1 if not computed yet. + +*/ + +typedef struct { + PyObject_HEAD + + Py_ssize_t fill; /* Number of active and dummy entries */ + Py_ssize_t used; /* Number of active entries */ + + /* The table contains mask + 1 slots, and that's a power of 2. + * We store the mask instead of the size because the mask is more + * frequently needed. + */ + Py_ssize_t mask; + + /* The table points to a fixed-size smalltable for small tables + * or to additional malloc'ed memory for bigger tables. + * The table pointer is never NULL, which saves us from repeated + * runtime null-tests.
+ */ + setentry *table; + Py_hash_t hash; /* Only used by frozenset objects */ + Py_ssize_t finger; /* Search finger for pop() */ + + setentry smalltable[PySet_MINSIZE]; + PyObject *weakreflist; /* List of weak references */ +} PySetObject; + +#define _PySet_CAST(so) \ + (assert(PyAnySet_Check(so)), _Py_CAST(PySetObject*, so)) + +static inline Py_ssize_t PySet_GET_SIZE(PyObject *so) { +#ifdef Py_GIL_DISABLED + return _Py_atomic_load_ssize_relaxed(&(_PySet_CAST(so)->used)); +#else + return _PySet_CAST(so)->used; +#endif +} +#define PySet_GET_SIZE(so) PySet_GET_SIZE(_PyObject_CAST(so)) diff --git a/Include/cpython/sysmodule.h b/Include/cpython/sysmodule.h new file mode 100644 index 0000000000000000000000000000000000000000..a3ac07f538a94f32892d8d98872b8cbced812063 --- /dev/null +++ b/Include/cpython/sysmodule.h @@ -0,0 +1,22 @@ +#ifndef Py_CPYTHON_SYSMODULE_H +# error "this header file must not be included directly" +#endif + +typedef int(*Py_AuditHookFunction)(const char *, PyObject *, void *); + +PyAPI_FUNC(int) PySys_AddAuditHook(Py_AuditHookFunction, void*); + +typedef struct { + FILE* perf_map; + PyThread_type_lock map_lock; +} PerfMapState; + +PyAPI_FUNC(int) PyUnstable_PerfMapState_Init(void); +PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry( + const void *code_addr, + unsigned int code_size, + const char *entry_name); +PyAPI_FUNC(void) PyUnstable_PerfMapState_Fini(void); +PyAPI_FUNC(int) PyUnstable_CopyPerfMapFile(const char* parent_filename); +PyAPI_FUNC(int) PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *); +PyAPI_FUNC(int) PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable); diff --git a/Include/cpython/traceback.h b/Include/cpython/traceback.h new file mode 100644 index 0000000000000000000000000000000000000000..81c51944f136f29396699dbcf8a3ddbd5c21b75a --- /dev/null +++ b/Include/cpython/traceback.h @@ -0,0 +1,13 @@ +#ifndef Py_CPYTHON_TRACEBACK_H +# error "this header file must not be included directly" +#endif + +typedef struct _traceback PyTracebackObject; + +struct _traceback { + PyObject_HEAD + PyTracebackObject *tb_next; + PyFrameObject *tb_frame; + int tb_lasti; + int tb_lineno; +}; diff --git a/Include/cpython/tracemalloc.h b/Include/cpython/tracemalloc.h new file mode 100644 index 0000000000000000000000000000000000000000..6d094291ae2e906451675b94836919b63a7c29f0 --- /dev/null +++ b/Include/cpython/tracemalloc.h @@ -0,0 +1,32 @@ +#ifndef Py_LIMITED_API +#ifndef Py_TRACEMALLOC_H +#define Py_TRACEMALLOC_H +#ifdef __cplusplus +extern "C" { +#endif + +/* Track an allocated memory block in the tracemalloc module. + Return 0 on success, return -1 on error (failed to allocate memory to store + the trace). + + Return -2 if tracemalloc is disabled. + + If memory block is already tracked, update the existing trace. */ +PyAPI_FUNC(int) PyTraceMalloc_Track( + unsigned int domain, + uintptr_t ptr, + size_t size); + +/* Untrack an allocated memory block in the tracemalloc module. + Do nothing if the block was not tracked. + + Return -2 if tracemalloc is disabled, otherwise return 0. 
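+
+   For example (a sketch; my_alloc()/my_free() and the domain value are
+   hypothetical), a custom allocator would pair the two calls:
+
+       #define MY_DOMAIN 12345
+       void *ptr = my_alloc(size);
+       if (ptr != NULL) {
+           (void)PyTraceMalloc_Track(MY_DOMAIN, (uintptr_t)ptr, size);
+       }
+       ...
+       (void)PyTraceMalloc_Untrack(MY_DOMAIN, (uintptr_t)ptr);
+       my_free(ptr);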
*/ +PyAPI_FUNC(int) PyTraceMalloc_Untrack( + unsigned int domain, + uintptr_t ptr); + +#ifdef __cplusplus +} +#endif +#endif // !Py_TRACEMALLOC_H +#endif // !Py_LIMITED_API diff --git a/Include/cpython/tupleobject.h b/Include/cpython/tupleobject.h new file mode 100644 index 0000000000000000000000000000000000000000..e530c8beda44ab3b409948001085d90c21709312 --- /dev/null +++ b/Include/cpython/tupleobject.h @@ -0,0 +1,38 @@ +#ifndef Py_CPYTHON_TUPLEOBJECT_H +# error "this header file must not be included directly" +#endif + +typedef struct { + PyObject_VAR_HEAD + /* ob_item contains space for 'ob_size' elements. + Items must normally not be NULL, except during construction when + the tuple is not yet visible outside the function that builds it. */ + PyObject *ob_item[1]; +} PyTupleObject; + +PyAPI_FUNC(int) _PyTuple_Resize(PyObject **, Py_ssize_t); + +/* Cast argument to PyTupleObject* type. */ +#define _PyTuple_CAST(op) \ + (assert(PyTuple_Check(op)), _Py_CAST(PyTupleObject*, (op))) + +// Macros and static inline functions, trading safety for speed + +static inline Py_ssize_t PyTuple_GET_SIZE(PyObject *op) { + PyTupleObject *tuple = _PyTuple_CAST(op); + return Py_SIZE(tuple); +} +#define PyTuple_GET_SIZE(op) PyTuple_GET_SIZE(_PyObject_CAST(op)) + +#define PyTuple_GET_ITEM(op, index) (_PyTuple_CAST(op)->ob_item[(index)]) + +/* Function *only* to be used to fill in brand new tuples */ +static inline void +PyTuple_SET_ITEM(PyObject *op, Py_ssize_t index, PyObject *value) { + PyTupleObject *tuple = _PyTuple_CAST(op); + assert(0 <= index); + assert(index < Py_SIZE(tuple)); + tuple->ob_item[index] = value; +} +#define PyTuple_SET_ITEM(op, index, value) \ + PyTuple_SET_ITEM(_PyObject_CAST(op), (index), _PyObject_CAST(value)) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h new file mode 100644 index 0000000000000000000000000000000000000000..d9b54bce83202daf4297e5009caafcf0e22eb018 --- /dev/null +++ b/Include/cpython/unicodeobject.h @@ -0,0 +1,703 @@ +#ifndef Py_CPYTHON_UNICODEOBJECT_H +# error "this header file must not be included directly" +#endif + +/* Py_UNICODE was the native Unicode storage format (code unit) used by + Python and represents a single Unicode element in the Unicode type. + With PEP 393, Py_UNICODE is deprecated and replaced with a + typedef to wchar_t. */ +Py_DEPRECATED(3.13) typedef wchar_t PY_UNICODE_TYPE; +Py_DEPRECATED(3.13) typedef wchar_t Py_UNICODE; + + +/* --- Internal Unicode Operations ---------------------------------------- */ + +// Static inline functions to work with surrogates +static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) { + return (0xD800 <= ch && ch <= 0xDFFF); +} +static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) { + return (0xD800 <= ch && ch <= 0xDBFF); +} +static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) { + return (0xDC00 <= ch && ch <= 0xDFFF); +} + +// Join two surrogate characters and return a single Py_UCS4 value. +static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) { + assert(Py_UNICODE_IS_HIGH_SURROGATE(high)); + assert(Py_UNICODE_IS_LOW_SURROGATE(low)); + return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF)); +} + +// High surrogate = top 10 bits added to 0xD800. +// The character must be in the range [U+10000; U+10ffff]. +static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) { + assert(0x10000 <= ch && ch <= 0x10ffff); + return (0xD800 - (0x10000 >> 10) + (ch >> 10)); +} + +// Low surrogate = bottom 10 bits added to 0xDC00. 
+// The character must be in the range [U+10000; U+10ffff]. +static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) { + assert(0x10000 <= ch && ch <= 0x10ffff); + return (0xDC00 + (ch & 0x3FF)); +} + + +/* --- Unicode Type ------------------------------------------------------- */ + +/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject + structure. state.ascii and state.compact are set, and the data + immediately follow the structure. utf8_length can be found + in the length field; the utf8 pointer is equal to the data pointer. */ +typedef struct { + /* There are 4 forms of Unicode strings: + + - compact ascii: + + * structure = PyASCIIObject + * test: PyUnicode_IS_COMPACT_ASCII(op) + * kind = PyUnicode_1BYTE_KIND + * compact = 1 + * ascii = 1 + * (length is the length of the utf8) + * (data starts just after the structure) + * (since ASCII is decoded from UTF-8, the utf8 string is the data) + + - compact: + + * structure = PyCompactUnicodeObject + * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op) + * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or + PyUnicode_4BYTE_KIND + * compact = 1 + * ascii = 0 + * utf8 is not shared with data + * utf8_length = 0 if utf8 is NULL + * (data starts just after the structure) + + - legacy string: + + * structure = PyUnicodeObject structure + * test: !PyUnicode_IS_COMPACT(op) + * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or + PyUnicode_4BYTE_KIND + * compact = 0 + * data.any is not NULL + * utf8 is shared and utf8_length = length with data.any if ascii = 1 + * utf8_length = 0 if utf8 is NULL + + Compact strings use only one memory block (structure + characters), + whereas legacy strings use one block for the structure and one block + for characters. + + Legacy strings are created by subclasses of Unicode. + + See also _PyUnicode_CheckConsistency(). + */ + PyObject_HEAD + Py_ssize_t length; /* Number of code points in the string */ + Py_hash_t hash; /* Hash value; -1 if not set */ + struct { + /* If interned is non-zero, the two references from the + dictionary to this object are *not* counted in ob_refcnt. + The possible values here are: + 0: Not Interned + 1: Interned + 2: Interned and Immortal + 3: Interned, Immortal, and Static + This categorization allows the runtime to determine the right + cleanup mechanism at runtime shutdown. */ + unsigned int interned:2; + /* Character size: + + - PyUnicode_1BYTE_KIND (1): + + * character type = Py_UCS1 (8 bits, unsigned) + * all characters are in the range U+0000-U+00FF (latin1) + * if ascii is set, all characters are in the range U+0000-U+007F + (ASCII), otherwise at least one character is in the range + U+0080-U+00FF + + - PyUnicode_2BYTE_KIND (2): + + * character type = Py_UCS2 (16 bits, unsigned) + * all characters are in the range U+0000-U+FFFF (BMP) + * at least one character is in the range U+0100-U+FFFF + + - PyUnicode_4BYTE_KIND (4): + + * character type = Py_UCS4 (32 bits, unsigned) + * all characters are in the range U+0000-U+10FFFF + * at least one character is in the range U+10000-U+10FFFF + */ + unsigned int kind:3; + /* Compact is with respect to the allocation scheme. Compact unicode + objects only require one memory block while non-compact objects use + one block for the PyUnicodeObject struct and another for its data + buffer. */ + unsigned int compact:1; + /* The string only contains characters in the range U+0000-U+007F (ASCII) + and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is + set, use the PyASCIIObject structure.
*/ + unsigned int ascii:1; + /* The object is statically allocated. */ + unsigned int statically_allocated:1; + /* Padding to ensure that PyUnicode_DATA() is always aligned to + 4 bytes (see issue #19537 on m68k). */ + unsigned int :24; + } state; +} PyASCIIObject; + +/* Non-ASCII strings allocated through PyUnicode_New use the + PyCompactUnicodeObject structure. state.compact is set, and the data + immediately follow the structure. */ +typedef struct { + PyASCIIObject _base; + Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the + * terminating \0. */ + char *utf8; /* UTF-8 representation (null-terminated) */ +} PyCompactUnicodeObject; + +/* Object format for Unicode subclasses. */ +typedef struct { + PyCompactUnicodeObject _base; + union { + void *any; + Py_UCS1 *latin1; + Py_UCS2 *ucs2; + Py_UCS4 *ucs4; + } data; /* Canonical, smallest-form Unicode buffer */ +} PyUnicodeObject; + + +#define _PyASCIIObject_CAST(op) \ + (assert(PyUnicode_Check(op)), \ + _Py_CAST(PyASCIIObject*, (op))) +#define _PyCompactUnicodeObject_CAST(op) \ + (assert(PyUnicode_Check(op)), \ + _Py_CAST(PyCompactUnicodeObject*, (op))) +#define _PyUnicodeObject_CAST(op) \ + (assert(PyUnicode_Check(op)), \ + _Py_CAST(PyUnicodeObject*, (op))) + + +/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ + +/* Values for PyASCIIObject.state: */ + +/* Interning state. */ +#define SSTATE_NOT_INTERNED 0 +#define SSTATE_INTERNED_MORTAL 1 +#define SSTATE_INTERNED_IMMORTAL 2 +#define SSTATE_INTERNED_IMMORTAL_STATIC 3 + +/* Use only if you know it's a string */ +static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) { + return _PyASCIIObject_CAST(op)->state.interned; +} +#define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op)) + +/* For backward compatibility */ +static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) { + return 1; +} +#define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op)) + +/* Return true if the string contains only ASCII characters, or 0 if not. The + string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be + ready. */ +static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) { + return _PyASCIIObject_CAST(op)->state.ascii; +} +#define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op)) + +/* Return true if the string is compact or 0 if not. + No type checks or Ready calls are performed. */ +static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) { + return _PyASCIIObject_CAST(op)->state.compact; +} +#define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op)) + +/* Return true if the string is a compact ASCII string (use PyASCIIObject + structure), or 0 if not. No type checks or Ready calls are performed. */ +static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) { + return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op)); +} +#define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op)) + +enum PyUnicode_Kind { +/* Return values of the PyUnicode_KIND() function: */ + PyUnicode_1BYTE_KIND = 1, + PyUnicode_2BYTE_KIND = 2, + PyUnicode_4BYTE_KIND = 4 +}; + +// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above. +// +// gh-89653: Converting this macro to a static inline function would introduce +// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and +// unsigned numbers) where kind type is an int or on +// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned). 
+#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind) + +/* Return a void pointer to the raw unicode buffer. */ +static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) { + if (PyUnicode_IS_ASCII(op)) { + return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1)); + } + return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1)); +} + +static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) { + void *data; + assert(!PyUnicode_IS_COMPACT(op)); + data = _PyUnicodeObject_CAST(op)->data.any; + assert(data != NULL); + return data; +} + +static inline void* PyUnicode_DATA(PyObject *op) { + if (PyUnicode_IS_COMPACT(op)) { + return _PyUnicode_COMPACT_DATA(op); + } + return _PyUnicode_NONCOMPACT_DATA(op); +} +#define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op)) + +/* Return pointers to the canonical representation cast to unsigned char, + Py_UCS2, or Py_UCS4 for direct character access. + No checks are performed; use PyUnicode_KIND() first to ensure + these will work correctly. */ + +#define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op)) +#define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op)) +#define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op)) + +/* Returns the length of the unicode string. */ +static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) { + return _PyASCIIObject_CAST(op)->length; +} +#define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op)) + +/* Write into the canonical representation. This function does not do any sanity + checks and is intended for usage in loops. The caller should cache the + kind and data pointers obtained from other function calls. + index is the index in the string (starts at 0) and value is the new + code point value which should be written to that location. */ +static inline void PyUnicode_WRITE(int kind, void *data, + Py_ssize_t index, Py_UCS4 value) +{ + assert(index >= 0); + if (kind == PyUnicode_1BYTE_KIND) { + assert(value <= 0xffU); + _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value); + } + else if (kind == PyUnicode_2BYTE_KIND) { + assert(value <= 0xffffU); + _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value); + } + else { + assert(kind == PyUnicode_4BYTE_KIND); + assert(value <= 0x10ffffU); + _Py_STATIC_CAST(Py_UCS4*, data)[index] = value; + } +} +#define PyUnicode_WRITE(kind, data, index, value) \ + PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \ + (index), _Py_STATIC_CAST(Py_UCS4, value)) + +/* Read a code point from the string's canonical representation. No checks + or ready calls are performed. */ +static inline Py_UCS4 PyUnicode_READ(int kind, + const void *data, Py_ssize_t index) +{ + assert(index >= 0); + if (kind == PyUnicode_1BYTE_KIND) { + return _Py_STATIC_CAST(const Py_UCS1*, data)[index]; + } + if (kind == PyUnicode_2BYTE_KIND) { + return _Py_STATIC_CAST(const Py_UCS2*, data)[index]; + } + assert(kind == PyUnicode_4BYTE_KIND); + return _Py_STATIC_CAST(const Py_UCS4*, data)[index]; +} +#define PyUnicode_READ(kind, data, index) \ + PyUnicode_READ(_Py_STATIC_CAST(int, kind), \ + _Py_STATIC_CAST(const void*, data), \ + (index)) + +/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it + calls PyUnicode_KIND() and might call it twice. For single reads, use + PyUnicode_READ_CHAR; for multiple consecutive reads, callers should + cache the kind and use PyUnicode_READ instead.
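+
+   A sketch of that cached-read pattern (`str` is assumed to be a PyObject*
+   pointing to a unicode object):
+
+       Py_ssize_t i, len = PyUnicode_GET_LENGTH(str);
+       int kind = PyUnicode_KIND(str);
+       const void *data = PyUnicode_DATA(str);
+       for (i = 0; i < len; i++) {
+           Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+           ...
+       }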
*/ +static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index) +{ + int kind; + + assert(index >= 0); + // Tolerate reading the NUL character at str[len(str)] + assert(index <= PyUnicode_GET_LENGTH(unicode)); + + kind = PyUnicode_KIND(unicode); + if (kind == PyUnicode_1BYTE_KIND) { + return PyUnicode_1BYTE_DATA(unicode)[index]; + } + if (kind == PyUnicode_2BYTE_KIND) { + return PyUnicode_2BYTE_DATA(unicode)[index]; + } + assert(kind == PyUnicode_4BYTE_KIND); + return PyUnicode_4BYTE_DATA(unicode)[index]; +} +#define PyUnicode_READ_CHAR(unicode, index) \ + PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index)) + +/* Return a maximum character value which is suitable for creating another + string based on op. This is always an approximation but more efficient + than iterating over the string. */ +static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op) +{ + int kind; + + if (PyUnicode_IS_ASCII(op)) { + return 0x7fU; + } + + kind = PyUnicode_KIND(op); + if (kind == PyUnicode_1BYTE_KIND) { + return 0xffU; + } + if (kind == PyUnicode_2BYTE_KIND) { + return 0xffffU; + } + assert(kind == PyUnicode_4BYTE_KIND); + return 0x10ffffU; +} +#define PyUnicode_MAX_CHAR_VALUE(op) \ + PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op)) + + +/* === Public API ========================================================= */ + +/* With PEP 393, this is the recommended way to allocate a new unicode object. + This function will allocate the object and its buffer in a single memory + block. Objects created using this function are not resizable. */ +PyAPI_FUNC(PyObject*) PyUnicode_New( + Py_ssize_t size, /* Number of code points in the new string */ + Py_UCS4 maxchar /* maximum code point value in the string */ + ); + +/* For backward compatibility */ +static inline int PyUnicode_READY(PyObject* Py_UNUSED(op)) +{ + return 0; +} +#define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op)) + +/* Copy characters from one unicode object into another. This function performs + character conversion when necessary and falls back to memcpy() if possible. + + Fail if *to* is too small (smaller than *how_many* or smaller than + len(from)-from_start), or if kind(from[from_start:from_start+how_many]) > + kind(to), or if *to* has more than 1 reference. + + Return the number of written characters, or return -1 and raise an exception + on error. + + Pseudo-code: + + how_many = min(how_many, len(from) - from_start) + to[to_start:to_start+how_many] = from[from_start:from_start+how_many] + return how_many + + Note: The function doesn't write a terminating null character. + */ +PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters( + PyObject *to, + Py_ssize_t to_start, + PyObject *from, + Py_ssize_t from_start, + Py_ssize_t how_many + ); + +/* Fill a string with a character: write fill_char into + unicode[start:start+length]. + + Fail if fill_char is bigger than the string maximum character, or if the + string has more than 1 reference. + + Return the number of written characters, or return -1 and raise an exception + on error. */ +PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill( + PyObject *unicode, + Py_ssize_t start, + Py_ssize_t length, + Py_UCS4 fill_char + ); + +/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters. + Scan the string to find the maximum character.
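+
+   For example (a sketch), building "Hi" followed by U+2603 from UCS2 data:
+
+       const Py_UCS2 buf[3] = {0x48, 0x69, 0x2603};
+       PyObject *s = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, buf, 3);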
*/ +PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( + int kind, + const void *buffer, + Py_ssize_t size); + + +/* --- _PyUnicodeWriter API ----------------------------------------------- */ + +typedef struct { + PyObject *buffer; + void *data; + int kind; + Py_UCS4 maxchar; + Py_ssize_t size; + Py_ssize_t pos; + + /* minimum number of allocated characters (default: 0) */ + Py_ssize_t min_length; + + /* minimum character (default: 127, ASCII) */ + Py_UCS4 min_char; + + /* If non-zero, overallocate the buffer (default: 0). */ + unsigned char overallocate; + + /* If readonly is 1, buffer is a shared string (cannot be modified) + and size is set to 0. */ + unsigned char readonly; +} _PyUnicodeWriter; + +// Initialize a Unicode writer. +// +// By default, the minimum buffer size is 0 characters and overallocation is +// disabled. Set min_length, min_char and overallocate attributes to control +// the allocation of the buffer. +PyAPI_FUNC(void) +_PyUnicodeWriter_Init(_PyUnicodeWriter *writer); + +/* Prepare the buffer to write 'length' characters + with the specified maximum character. + + Return 0 on success, raise an exception and return -1 on error. */ +#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \ + (((MAXCHAR) <= (WRITER)->maxchar \ + && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \ + ? 0 \ + : (((LENGTH) == 0) \ + ? 0 \ + : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))) + +/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro + instead. */ +PyAPI_FUNC(int) +_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, + Py_ssize_t length, Py_UCS4 maxchar); + +/* Prepare the buffer to have at least the kind KIND. + For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will + support characters in the range U+0000-U+FFFF. + + Return 0 on success, raise an exception and return -1 on error. */ +#define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \ + ((KIND) <= (WRITER)->kind \ + ? 0 \ + : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))) + +/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind() + macro instead. */ +PyAPI_FUNC(int) +_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, + int kind); + +/* Append a Unicode character. + Return 0 on success, raise an exception and return -1 on error. */ +PyAPI_FUNC(int) +_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, + Py_UCS4 ch + ); + +/* Append a Unicode string. + Return 0 on success, raise an exception and return -1 on error. */ +PyAPI_FUNC(int) +_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, + PyObject *str /* Unicode string */ + ); + +/* Append a substring of a Unicode string. + Return 0 on success, raise an exception and return -1 on error. */ +PyAPI_FUNC(int) +_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, + PyObject *str, /* Unicode string */ + Py_ssize_t start, + Py_ssize_t end + ); + +/* Append an ASCII-encoded byte string. + Return 0 on success, raise an exception and return -1 on error. */ +PyAPI_FUNC(int) +_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, + const char *str, /* ASCII-encoded byte string */ + Py_ssize_t len /* number of bytes, or -1 if unknown */ + ); + +/* Append a latin1-encoded byte string. + Return 0 on success, raise an exception and return -1 on error. */ +PyAPI_FUNC(int) +_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, + const char *str, /* latin1-encoded byte string */ + Py_ssize_t len /* length in bytes */ + ); + +/* Get the value of the writer as a Unicode string.
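+   A typical life cycle, as a sketch:
+
+       _PyUnicodeWriter writer;
+       _PyUnicodeWriter_Init(&writer);
+       if (_PyUnicodeWriter_WriteASCIIString(&writer, "abc", 3) < 0) {
+           _PyUnicodeWriter_Dealloc(&writer);
+           return NULL;
+       }
+       return _PyUnicodeWriter_Finish(&writer);
+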
Clear the + buffer of the writer. Raise an exception and return NULL + on error. */ +PyAPI_FUNC(PyObject *) +_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer); + +/* Deallocate memory of a writer (clear its internal buffer). */ +PyAPI_FUNC(void) +_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer); + + +/* --- Manage the default encoding ---------------------------------------- */ + +/* Returns a pointer to the default encoding (UTF-8) of the + Unicode object unicode. + + Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation + in the unicodeobject. + + _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to + support the previous internal function with the same behaviour. + + Use of this API is DEPRECATED since no size information can be + extracted from the returned data. +*/ + +PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode); + +// Alias kept for backward compatibility +#define _PyUnicode_AsString PyUnicode_AsUTF8 + + +/* === Characters Type APIs =============================================== */ + +/* These should not be used directly. Use the Py_UNICODE_IS* and + Py_UNICODE_TO* macros instead. + + These APIs are implemented in Objects/unicodectype.c. + +*/ + +PyAPI_FUNC(int) _PyUnicode_IsLowercase( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(int) _PyUnicode_IsUppercase( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(int) _PyUnicode_IsTitlecase( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(int) _PyUnicode_IsWhitespace( + const Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(int) _PyUnicode_IsLinebreak( + const Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(int) _PyUnicode_ToDigit( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(double) _PyUnicode_ToNumeric( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(int) _PyUnicode_IsDigit( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(int) _PyUnicode_IsNumeric( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(int) _PyUnicode_IsPrintable( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(int) _PyUnicode_IsAlpha( + Py_UCS4 ch /* Unicode character */ + ); + +// Helper array used by Py_UNICODE_ISSPACE(). +PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; + +// Since splitting on whitespace is an important use case, and +// whitespace in most situations is solely ASCII whitespace, we +// optimize for the common case by using a quick look-up table +// _Py_ascii_whitespace (see below) with an inlined check. 
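+//
+// For example (a sketch, reusing the cached kind/data pattern from
+// PyUnicode_READ above), skipping leading whitespace in a scan:
+//
+//     while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
+//         i++;
+//     }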
+static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) { + if (ch < 128) { + return _Py_ascii_whitespace[ch]; + } + return _PyUnicode_IsWhitespace(ch); +} + +#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) +#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) +#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) +#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) + +#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) +#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) +#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) + +#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) +#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) +#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) +#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) + +#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) +#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) +#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) + +#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) + +static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) { + return (Py_UNICODE_ISALPHA(ch) + || Py_UNICODE_ISDECIMAL(ch) + || Py_UNICODE_ISDIGIT(ch) + || Py_UNICODE_ISNUMERIC(ch)); +} + + +/* === Misc functions ===================================================== */ + +// Return an interned Unicode object for an Identifier; may fail if there is no +// memory. +PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*); diff --git a/Include/cpython/warnings.h b/Include/cpython/warnings.h new file mode 100644 index 0000000000000000000000000000000000000000..8731fd2e96b716fd87d988d40320e0ab4b15651f --- /dev/null +++ b/Include/cpython/warnings.h @@ -0,0 +1,26 @@ +#ifndef Py_CPYTHON_WARNINGS_H +# error "this header file must not be included directly" +#endif + +PyAPI_FUNC(int) PyErr_WarnExplicitObject( + PyObject *category, + PyObject *message, + PyObject *filename, + int lineno, + PyObject *module, + PyObject *registry); + +PyAPI_FUNC(int) PyErr_WarnExplicitFormat( + PyObject *category, + const char *filename, int lineno, + const char *module, PyObject *registry, + const char *format, ...); + +// DEPRECATED: Use PyErr_WarnEx() instead. +#define PyErr_Warn(category, msg) PyErr_WarnEx((category), (msg), 1) + +int _PyErr_WarnExplicitObjectWithContext( + PyObject *category, + PyObject *message, + PyObject *filename, + int lineno); diff --git a/Include/cpython/weakrefobject.h b/Include/cpython/weakrefobject.h new file mode 100644 index 0000000000000000000000000000000000000000..28acf7265a08563af3de1deaf71cea08a6ec08d7 --- /dev/null +++ b/Include/cpython/weakrefobject.h @@ -0,0 +1,63 @@ +#ifndef Py_CPYTHON_WEAKREFOBJECT_H +# error "this header file must not be included directly" +#endif + +/* PyWeakReference is the base struct for the Python ReferenceType, ProxyType, + * and CallableProxyType. + */ +struct _PyWeakReference { + PyObject_HEAD + + /* The object to which this is a weak reference, or Py_None if none. + * Note that this is a stealth reference: wr_object's refcount is + * not incremented to reflect this pointer. + */ + PyObject *wr_object; + + /* A callable to invoke when wr_object dies, or NULL if none. */ + PyObject *wr_callback; + + /* A cache for wr_object's hash code. As usual for hashes, this is -1 + * if the hash code isn't known yet. + */ + Py_hash_t hash; + + /* If wr_object is weakly referenced, wr_object has a doubly-linked NULL- + * terminated list of weak references to it. These are the list pointers. 
+ * If wr_object goes away, wr_object is set to Py_None, and these pointers + * have no meaning then. + */ + PyWeakReference *wr_prev; + PyWeakReference *wr_next; + vectorcallfunc vectorcall; + +#ifdef Py_GIL_DISABLED + /* Pointer to the lock used when clearing in free-threaded builds. + * Normally this can be derived from wr_object, but in some cases we need + * to lock after wr_object has been set to Py_None. + */ + PyMutex *weakrefs_lock; +#endif +}; + +PyAPI_FUNC(void) _PyWeakref_ClearRef(PyWeakReference *self); + +Py_DEPRECATED(3.13) static inline PyObject* PyWeakref_GET_OBJECT(PyObject *ref_obj) +{ + PyWeakReference *ref; + PyObject *obj; + assert(PyWeakref_Check(ref_obj)); + ref = _Py_CAST(PyWeakReference*, ref_obj); + obj = ref->wr_object; + // Explanation for the Py_REFCNT() check: when a weakref's target is part + // of a long chain of deallocations which triggers the trashcan mechanism, + // clearing the weakrefs can be delayed long after the target's refcount + // has dropped to zero. In the meantime, code accessing the weakref will + // be able to "see" the target object even though it is supposed to be + // unreachable. See issue gh-60806. + if (Py_REFCNT(obj) > 0) { + return obj; + } + return Py_None; +} +#define PyWeakref_GET_OBJECT(ref) PyWeakref_GET_OBJECT(_PyObject_CAST(ref)) diff --git a/Include/critical_section.h b/Include/critical_section.h new file mode 100644 index 0000000000000000000000000000000000000000..3b37615a8b17e2a457b2043c7f24165984e8a4ef --- /dev/null +++ b/Include/critical_section.h @@ -0,0 +1,16 @@ +#ifndef Py_CRITICAL_SECTION_H +#define Py_CRITICAL_SECTION_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_CRITICAL_SECTION_H +# include "cpython/critical_section.h" +# undef Py_CPYTHON_CRITICAL_SECTION_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_CRITICAL_SECTION_H */ diff --git a/Include/datetime.h b/Include/datetime.h new file mode 100644 index 0000000000000000000000000000000000000000..b78cc0e8e2e5accd5952438b9fb09c056cc057b7 --- /dev/null +++ b/Include/datetime.h @@ -0,0 +1,267 @@ +/* datetime.h + */ +#ifndef Py_LIMITED_API +#ifndef DATETIME_H +#define DATETIME_H +#ifdef __cplusplus +extern "C" { +#endif + +/* Fields are packed into successive bytes, each viewed as unsigned and + * big-endian, unless otherwise noted: + * + * byte offset + * 0 year 2 bytes, 1-9999 + * 2 month 1 byte, 1-12 + * 3 day 1 byte, 1-31 + * 4 hour 1 byte, 0-23 + * 5 minute 1 byte, 0-59 + * 6 second 1 byte, 0-59 + * 7 usecond 3 bytes, 0-999999 + * 10 + */ + +/* # of bytes for year, month, and day. */ +#define _PyDateTime_DATE_DATASIZE 4 + +/* # of bytes for hour, minute, second, and usecond. */ +#define _PyDateTime_TIME_DATASIZE 6 + +/* # of bytes for year, month, day, hour, minute, second, and usecond. */ +#define _PyDateTime_DATETIME_DATASIZE 10 + + +typedef struct +{ + PyObject_HEAD + Py_hash_t hashcode; /* -1 when unknown */ + int days; /* -MAX_DELTA_DAYS <= days <= MAX_DELTA_DAYS */ + int seconds; /* 0 <= seconds < 24*3600 is invariant */ + int microseconds; /* 0 <= microseconds < 1000000 is invariant */ +} PyDateTime_Delta; + +typedef struct +{ + PyObject_HEAD /* a pure abstract base class */ +} PyDateTime_TZInfo; + + +/* The datetime and time types have hashcodes, and an optional tzinfo member, + * present if and only if hastzinfo is true. 
+ */ +#define _PyTZINFO_HEAD \ + PyObject_HEAD \ + Py_hash_t hashcode; \ + char hastzinfo; /* boolean flag */ + +/* No _PyDateTime_BaseTZInfo is allocated; it's just to have something + * convenient to cast to, when getting at the hastzinfo member of objects + * starting with _PyTZINFO_HEAD. + */ +typedef struct +{ + _PyTZINFO_HEAD +} _PyDateTime_BaseTZInfo; + +/* All time objects are of PyDateTime_TimeType, but that can be allocated + * in two ways, with or without a tzinfo member. Without is the same as + * tzinfo == None, but consumes less memory. _PyDateTime_BaseTime is an + * internal struct used to allocate the right amount of space for the + * "without" case. + */ +#define _PyDateTime_TIMEHEAD \ + _PyTZINFO_HEAD \ + unsigned char data[_PyDateTime_TIME_DATASIZE]; + +typedef struct +{ + _PyDateTime_TIMEHEAD +} _PyDateTime_BaseTime; /* hastzinfo false */ + +typedef struct +{ + _PyDateTime_TIMEHEAD + unsigned char fold; + PyObject *tzinfo; +} PyDateTime_Time; /* hastzinfo true */ + + +/* All datetime objects are of PyDateTime_DateTimeType, but that can be + * allocated in two ways too, just like for time objects above. In addition, + * the plain date type is a base class for datetime, so it must also have + * a hastzinfo member (although it's unused there). + */ +typedef struct +{ + _PyTZINFO_HEAD + unsigned char data[_PyDateTime_DATE_DATASIZE]; +} PyDateTime_Date; + +#define _PyDateTime_DATETIMEHEAD \ + _PyTZINFO_HEAD \ + unsigned char data[_PyDateTime_DATETIME_DATASIZE]; + +typedef struct +{ + _PyDateTime_DATETIMEHEAD +} _PyDateTime_BaseDateTime; /* hastzinfo false */ + +typedef struct +{ + _PyDateTime_DATETIMEHEAD + unsigned char fold; + PyObject *tzinfo; +} PyDateTime_DateTime; /* hastzinfo true */ + + +/* Apply for date and datetime instances. */ + +// o is a pointer to a time or a datetime object. +#define _PyDateTime_HAS_TZINFO(o) (((_PyDateTime_BaseTZInfo *)(o))->hastzinfo) + +#define PyDateTime_GET_YEAR(o) ((((PyDateTime_Date*)(o))->data[0] << 8) | \ + ((PyDateTime_Date*)(o))->data[1]) +#define PyDateTime_GET_MONTH(o) (((PyDateTime_Date*)(o))->data[2]) +#define PyDateTime_GET_DAY(o) (((PyDateTime_Date*)(o))->data[3]) + +#define PyDateTime_DATE_GET_HOUR(o) (((PyDateTime_DateTime*)(o))->data[4]) +#define PyDateTime_DATE_GET_MINUTE(o) (((PyDateTime_DateTime*)(o))->data[5]) +#define PyDateTime_DATE_GET_SECOND(o) (((PyDateTime_DateTime*)(o))->data[6]) +#define PyDateTime_DATE_GET_MICROSECOND(o) \ + ((((PyDateTime_DateTime*)(o))->data[7] << 16) | \ + (((PyDateTime_DateTime*)(o))->data[8] << 8) | \ + ((PyDateTime_DateTime*)(o))->data[9]) +#define PyDateTime_DATE_GET_FOLD(o) (((PyDateTime_DateTime*)(o))->fold) +#define PyDateTime_DATE_GET_TZINFO(o) (_PyDateTime_HAS_TZINFO((o)) ? \ + ((PyDateTime_DateTime *)(o))->tzinfo : Py_None) + +/* Apply for time instances. */ +#define PyDateTime_TIME_GET_HOUR(o) (((PyDateTime_Time*)(o))->data[0]) +#define PyDateTime_TIME_GET_MINUTE(o) (((PyDateTime_Time*)(o))->data[1]) +#define PyDateTime_TIME_GET_SECOND(o) (((PyDateTime_Time*)(o))->data[2]) +#define PyDateTime_TIME_GET_MICROSECOND(o) \ + ((((PyDateTime_Time*)(o))->data[3] << 16) | \ + (((PyDateTime_Time*)(o))->data[4] << 8) | \ + ((PyDateTime_Time*)(o))->data[5]) +#define PyDateTime_TIME_GET_FOLD(o) (((PyDateTime_Time*)(o))->fold) +#define PyDateTime_TIME_GET_TZINFO(o) (_PyDateTime_HAS_TZINFO(o) ? 
\ + ((PyDateTime_Time *)(o))->tzinfo : Py_None) + +/* Apply for time delta instances */ +#define PyDateTime_DELTA_GET_DAYS(o) (((PyDateTime_Delta*)(o))->days) +#define PyDateTime_DELTA_GET_SECONDS(o) (((PyDateTime_Delta*)(o))->seconds) +#define PyDateTime_DELTA_GET_MICROSECONDS(o) \ + (((PyDateTime_Delta*)(o))->microseconds) + + +/* Define structure for C API. */ +typedef struct { + /* type objects */ + PyTypeObject *DateType; + PyTypeObject *DateTimeType; + PyTypeObject *TimeType; + PyTypeObject *DeltaType; + PyTypeObject *TZInfoType; + + /* singletons */ + PyObject *TimeZone_UTC; + + /* constructors */ + PyObject *(*Date_FromDate)(int, int, int, PyTypeObject*); + PyObject *(*DateTime_FromDateAndTime)(int, int, int, int, int, int, int, + PyObject*, PyTypeObject*); + PyObject *(*Time_FromTime)(int, int, int, int, PyObject*, PyTypeObject*); + PyObject *(*Delta_FromDelta)(int, int, int, int, PyTypeObject*); + PyObject *(*TimeZone_FromTimeZone)(PyObject *offset, PyObject *name); + + /* constructors for the DB API */ + PyObject *(*DateTime_FromTimestamp)(PyObject*, PyObject*, PyObject*); + PyObject *(*Date_FromTimestamp)(PyObject*, PyObject*); + + /* PEP 495 constructors */ + PyObject *(*DateTime_FromDateAndTimeAndFold)(int, int, int, int, int, int, int, + PyObject*, int, PyTypeObject*); + PyObject *(*Time_FromTimeAndFold)(int, int, int, int, PyObject*, int, PyTypeObject*); + +} PyDateTime_CAPI; + +#define PyDateTime_CAPSULE_NAME "datetime.datetime_CAPI" + + +/* This block is only used as part of the public API and should not be + * included in _datetimemodule.c, which does not use the C API capsule. + * See bpo-35081 for more details. + * */ +#ifndef _PY_DATETIME_IMPL +/* Define global variable for the C API and a macro for setting it. */ +static PyDateTime_CAPI *PyDateTimeAPI = NULL; + +#define PyDateTime_IMPORT \ + PyDateTimeAPI = (PyDateTime_CAPI *)PyCapsule_Import(PyDateTime_CAPSULE_NAME, 0) + +/* Macro for access to the UTC singleton */ +#define PyDateTime_TimeZone_UTC PyDateTimeAPI->TimeZone_UTC + +/* Macros for type checking when not building the Python core. */ +#define PyDate_Check(op) PyObject_TypeCheck((op), PyDateTimeAPI->DateType) +#define PyDate_CheckExact(op) Py_IS_TYPE((op), PyDateTimeAPI->DateType) + +#define PyDateTime_Check(op) PyObject_TypeCheck((op), PyDateTimeAPI->DateTimeType) +#define PyDateTime_CheckExact(op) Py_IS_TYPE((op), PyDateTimeAPI->DateTimeType) + +#define PyTime_Check(op) PyObject_TypeCheck((op), PyDateTimeAPI->TimeType) +#define PyTime_CheckExact(op) Py_IS_TYPE((op), PyDateTimeAPI->TimeType) + +#define PyDelta_Check(op) PyObject_TypeCheck((op), PyDateTimeAPI->DeltaType) +#define PyDelta_CheckExact(op) Py_IS_TYPE((op), PyDateTimeAPI->DeltaType) + +#define PyTZInfo_Check(op) PyObject_TypeCheck((op), PyDateTimeAPI->TZInfoType) +#define PyTZInfo_CheckExact(op) Py_IS_TYPE((op), PyDateTimeAPI->TZInfoType) + + +/* Macros for accessing constructors in a simplified fashion. 
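+
+   For example (a sketch; PyDateTime_IMPORT must have been executed first):
+
+       PyObject *d = PyDate_FromDate(2024, 1, 1);
+       PyObject *dt = PyDateTime_FromDateAndTime(2024, 1, 1, 12, 30, 0, 0);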
*/ +#define PyDate_FromDate(year, month, day) \ + PyDateTimeAPI->Date_FromDate((year), (month), (day), PyDateTimeAPI->DateType) + +#define PyDateTime_FromDateAndTime(year, month, day, hour, min, sec, usec) \ + PyDateTimeAPI->DateTime_FromDateAndTime((year), (month), (day), (hour), \ + (min), (sec), (usec), Py_None, PyDateTimeAPI->DateTimeType) + +#define PyDateTime_FromDateAndTimeAndFold(year, month, day, hour, min, sec, usec, fold) \ + PyDateTimeAPI->DateTime_FromDateAndTimeAndFold((year), (month), (day), (hour), \ + (min), (sec), (usec), Py_None, (fold), PyDateTimeAPI->DateTimeType) + +#define PyTime_FromTime(hour, minute, second, usecond) \ + PyDateTimeAPI->Time_FromTime((hour), (minute), (second), (usecond), \ + Py_None, PyDateTimeAPI->TimeType) + +#define PyTime_FromTimeAndFold(hour, minute, second, usecond, fold) \ + PyDateTimeAPI->Time_FromTimeAndFold((hour), (minute), (second), (usecond), \ + Py_None, (fold), PyDateTimeAPI->TimeType) + +#define PyDelta_FromDSU(days, seconds, useconds) \ + PyDateTimeAPI->Delta_FromDelta((days), (seconds), (useconds), 1, \ + PyDateTimeAPI->DeltaType) + +#define PyTimeZone_FromOffset(offset) \ + PyDateTimeAPI->TimeZone_FromTimeZone((offset), NULL) + +#define PyTimeZone_FromOffsetAndName(offset, name) \ + PyDateTimeAPI->TimeZone_FromTimeZone((offset), (name)) + +/* Macros supporting the DB API. */ +#define PyDateTime_FromTimestamp(args) \ + PyDateTimeAPI->DateTime_FromTimestamp( \ + (PyObject*) (PyDateTimeAPI->DateTimeType), (args), NULL) + +#define PyDate_FromTimestamp(args) \ + PyDateTimeAPI->Date_FromTimestamp( \ + (PyObject*) (PyDateTimeAPI->DateType), (args)) + +#endif /* !defined(_PY_DATETIME_IMPL) */ + +#ifdef __cplusplus +} +#endif +#endif +#endif /* !Py_LIMITED_API */ diff --git a/Include/descrobject.h b/Include/descrobject.h new file mode 100644 index 0000000000000000000000000000000000000000..fd66d17b497a31f295d5791c49fb879fad8883cf --- /dev/null +++ b/Include/descrobject.h @@ -0,0 +1,100 @@ +/* Descriptors */ +#ifndef Py_DESCROBJECT_H +#define Py_DESCROBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +typedef PyObject *(*getter)(PyObject *, void *); +typedef int (*setter)(PyObject *, PyObject *, void *); + +struct PyGetSetDef { + const char *name; + getter get; + setter set; + const char *doc; + void *closure; +}; + +PyAPI_DATA(PyTypeObject) PyClassMethodDescr_Type; +PyAPI_DATA(PyTypeObject) PyGetSetDescr_Type; +PyAPI_DATA(PyTypeObject) PyMemberDescr_Type; +PyAPI_DATA(PyTypeObject) PyMethodDescr_Type; +PyAPI_DATA(PyTypeObject) PyWrapperDescr_Type; +PyAPI_DATA(PyTypeObject) PyDictProxy_Type; +PyAPI_DATA(PyTypeObject) PyProperty_Type; + +PyAPI_FUNC(PyObject *) PyDescr_NewMethod(PyTypeObject *, PyMethodDef *); +PyAPI_FUNC(PyObject *) PyDescr_NewClassMethod(PyTypeObject *, PyMethodDef *); +PyAPI_FUNC(PyObject *) PyDescr_NewMember(PyTypeObject *, PyMemberDef *); +PyAPI_FUNC(PyObject *) PyDescr_NewGetSet(PyTypeObject *, PyGetSetDef *); + +PyAPI_FUNC(PyObject *) PyDictProxy_New(PyObject *); +PyAPI_FUNC(PyObject *) PyWrapper_New(PyObject *, PyObject *); + + +/* An array of PyMemberDef structures defines the name, type and offset + of selected members of a C structure. These can be read by + PyMember_GetOne() and set by PyMember_SetOne() (except if their READONLY + flag is set). The array must be terminated with an entry whose name + pointer is NULL. */ +struct PyMemberDef { + const char *name; + int type; + Py_ssize_t offset; + int flags; + const char *doc; +}; + +// These constants used to be in structmember.h, not prefixed by Py_. 
+// (structmember.h now has aliases to the new names.) + +/* Types */ +#define Py_T_SHORT 0 +#define Py_T_INT 1 +#define Py_T_LONG 2 +#define Py_T_FLOAT 3 +#define Py_T_DOUBLE 4 +#define Py_T_STRING 5 +#define _Py_T_OBJECT 6 // Deprecated, use Py_T_OBJECT_EX instead +/* the ordering here is weird for binary compatibility */ +#define Py_T_CHAR 7 /* 1-character string */ +#define Py_T_BYTE 8 /* 8-bit signed int */ +/* unsigned variants: */ +#define Py_T_UBYTE 9 +#define Py_T_USHORT 10 +#define Py_T_UINT 11 +#define Py_T_ULONG 12 + +/* Added by Jack: strings contained in the structure */ +#define Py_T_STRING_INPLACE 13 + +/* Added by Lillo: bools contained in the structure (assumed char) */ +#define Py_T_BOOL 14 + +#define Py_T_OBJECT_EX 16 +#define Py_T_LONGLONG 17 +#define Py_T_ULONGLONG 18 + +#define Py_T_PYSSIZET 19 /* Py_ssize_t */ +#define _Py_T_NONE 20 // Deprecated. Value is always None. + +/* Flags */ +#define Py_READONLY 1 +#define Py_AUDIT_READ 2 // Added in 3.10, harmless no-op before that +#define _Py_WRITE_RESTRICTED 4 // Deprecated, no-op. Do not reuse the value. +#define Py_RELATIVE_OFFSET 8 + +PyAPI_FUNC(PyObject *) PyMember_GetOne(const char *, PyMemberDef *); +PyAPI_FUNC(int) PyMember_SetOne(char *, PyMemberDef *, PyObject *); + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_DESCROBJECT_H +# include "cpython/descrobject.h" +# undef Py_CPYTHON_DESCROBJECT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_DESCROBJECT_H */ diff --git a/Include/dictobject.h b/Include/dictobject.h new file mode 100644 index 0000000000000000000000000000000000000000..1bbeec1ab699e7581d592bce151186582b6c3c82 --- /dev/null +++ b/Include/dictobject.h @@ -0,0 +1,108 @@ +#ifndef Py_DICTOBJECT_H +#define Py_DICTOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +/* Dictionary object type -- mapping from hashable object to object */ + +/* The distribution includes a separate file, Objects/dictnotes.txt, + describing explorations into dictionary design and optimization. + It covers typical dictionary use patterns, the parameters for + tuning dictionaries, and several ideas for possible optimizations. +*/ + +PyAPI_DATA(PyTypeObject) PyDict_Type; + +#define PyDict_Check(op) \ + PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_DICT_SUBCLASS) +#define PyDict_CheckExact(op) Py_IS_TYPE((op), &PyDict_Type) + +PyAPI_FUNC(PyObject *) PyDict_New(void); +PyAPI_FUNC(PyObject *) PyDict_GetItem(PyObject *mp, PyObject *key); +PyAPI_FUNC(PyObject *) PyDict_GetItemWithError(PyObject *mp, PyObject *key); +PyAPI_FUNC(int) PyDict_SetItem(PyObject *mp, PyObject *key, PyObject *item); +PyAPI_FUNC(int) PyDict_DelItem(PyObject *mp, PyObject *key); +PyAPI_FUNC(void) PyDict_Clear(PyObject *mp); +PyAPI_FUNC(int) PyDict_Next( + PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value); +PyAPI_FUNC(PyObject *) PyDict_Keys(PyObject *mp); +PyAPI_FUNC(PyObject *) PyDict_Values(PyObject *mp); +PyAPI_FUNC(PyObject *) PyDict_Items(PyObject *mp); +PyAPI_FUNC(Py_ssize_t) PyDict_Size(PyObject *mp); +PyAPI_FUNC(PyObject *) PyDict_Copy(PyObject *mp); +PyAPI_FUNC(int) PyDict_Contains(PyObject *mp, PyObject *key); + +/* PyDict_Update(mp, other) is equivalent to PyDict_Merge(mp, other, 1). */ +PyAPI_FUNC(int) PyDict_Update(PyObject *mp, PyObject *other); + +/* PyDict_Merge updates/merges from a mapping object (an object that + supports PyMapping_Keys() and PyObject_GetItem()). If override is true, + the last occurrence of a key wins, else the first. The Python + dict.update(other) is equivalent to PyDict_Merge(dict, other, 1). 
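+
+   A short sketch of the override semantics (illustrative only; assume
+   dicts a = {'k': 1} and b = {'k': 2} were built with the functions above):
+
+       PyDict_Merge(a, b, 0);   // override=0: the existing a['k'] == 1 is kept
+       PyDict_Merge(a, b, 1);   // override=1: a['k'] becomes 2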
+*/ +PyAPI_FUNC(int) PyDict_Merge(PyObject *mp, + PyObject *other, + int override); + +/* PyDict_MergeFromSeq2 updates/merges from an iterable object producing + iterable objects of length 2. If override is true, the last occurrence + of a key wins, else the first. The Python dict constructor dict(seq2) + is equivalent to dict={}; PyDict_MergeFromSeq(dict, seq2, 1). +*/ +PyAPI_FUNC(int) PyDict_MergeFromSeq2(PyObject *d, + PyObject *seq2, + int override); + +PyAPI_FUNC(PyObject *) PyDict_GetItemString(PyObject *dp, const char *key); +PyAPI_FUNC(int) PyDict_SetItemString(PyObject *dp, const char *key, PyObject *item); +PyAPI_FUNC(int) PyDict_DelItemString(PyObject *dp, const char *key); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 +// Return the object from dictionary *op* which has a key *key*. +// - If the key is present, set *result to a new strong reference to the value +// and return 1. +// - If the key is missing, set *result to NULL and return 0 . +// - On error, raise an exception and return -1. +PyAPI_FUNC(int) PyDict_GetItemRef(PyObject *mp, PyObject *key, PyObject **result); +PyAPI_FUNC(int) PyDict_GetItemStringRef(PyObject *mp, const char *key, PyObject **result); +#endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 +PyAPI_FUNC(PyObject *) PyObject_GenericGetDict(PyObject *, void *); +#endif + +/* Dictionary (keys, values, items) views */ + +PyAPI_DATA(PyTypeObject) PyDictKeys_Type; +PyAPI_DATA(PyTypeObject) PyDictValues_Type; +PyAPI_DATA(PyTypeObject) PyDictItems_Type; + +#define PyDictKeys_Check(op) PyObject_TypeCheck((op), &PyDictKeys_Type) +#define PyDictValues_Check(op) PyObject_TypeCheck((op), &PyDictValues_Type) +#define PyDictItems_Check(op) PyObject_TypeCheck((op), &PyDictItems_Type) +/* This excludes Values, since they are not sets. */ +# define PyDictViewSet_Check(op) \ + (PyDictKeys_Check(op) || PyDictItems_Check(op)) + +/* Dictionary (key, value, items) iterators */ + +PyAPI_DATA(PyTypeObject) PyDictIterKey_Type; +PyAPI_DATA(PyTypeObject) PyDictIterValue_Type; +PyAPI_DATA(PyTypeObject) PyDictIterItem_Type; + +PyAPI_DATA(PyTypeObject) PyDictRevIterKey_Type; +PyAPI_DATA(PyTypeObject) PyDictRevIterItem_Type; +PyAPI_DATA(PyTypeObject) PyDictRevIterValue_Type; + + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_DICTOBJECT_H +# include "cpython/dictobject.h" +# undef Py_CPYTHON_DICTOBJECT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_DICTOBJECT_H */ diff --git a/Include/dynamic_annotations.h b/Include/dynamic_annotations.h new file mode 100644 index 0000000000000000000000000000000000000000..4d4def9bf8983e21209b598fc7cba728ca8c1d4c --- /dev/null +++ b/Include/dynamic_annotations.h @@ -0,0 +1,499 @@ +/* Copyright (c) 2008-2009, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Kostya Serebryany + * Copied to CPython by Jeffrey Yasskin, with all macros renamed to + * start with _Py_ to avoid colliding with users embedding Python, and + * with deprecated macros removed. + */ + +/* This file defines dynamic annotations for use with dynamic analysis + tool such as valgrind, PIN, etc. + + Dynamic annotation is a source code annotation that affects + the generated code (that is, the annotation is not a comment). + Each such annotation is attached to a particular + instruction and/or to a particular object (address) in the program. + + The annotations that should be used by users are macros in all upper-case + (e.g., _Py_ANNOTATE_NEW_MEMORY). + + Actual implementation of these macros may differ depending on the + dynamic analysis tool being used. + + See https://code.google.com/p/data-race-test/ for more information. + + This file supports the following dynamic analysis tools: + - None (DYNAMIC_ANNOTATIONS_ENABLED is not defined or zero). + Macros are defined empty. + - ThreadSanitizer, Helgrind, DRD (DYNAMIC_ANNOTATIONS_ENABLED is 1). + Macros are defined as calls to non-inlinable empty functions + that are intercepted by Valgrind. */ + +#ifndef __DYNAMIC_ANNOTATIONS_H__ +#define __DYNAMIC_ANNOTATIONS_H__ + +#ifndef DYNAMIC_ANNOTATIONS_ENABLED +# define DYNAMIC_ANNOTATIONS_ENABLED 0 +#endif + +#if DYNAMIC_ANNOTATIONS_ENABLED != 0 + + /* ------------------------------------------------------------- + Annotations useful when implementing condition variables such as CondVar, + using conditional critical sections (Await/LockWhen) and when constructing + user-defined synchronization mechanisms. + + The annotations _Py_ANNOTATE_HAPPENS_BEFORE() and + _Py_ANNOTATE_HAPPENS_AFTER() can be used to define happens-before arcs in + user-defined synchronization mechanisms: the race detector will infer an + arc from the former to the latter when they share the same argument + pointer. + + Example 1 (reference counting): + + void Unref() { + _Py_ANNOTATE_HAPPENS_BEFORE(&refcount_); + if (AtomicDecrementByOne(&refcount_) == 0) { + _Py_ANNOTATE_HAPPENS_AFTER(&refcount_); + delete this; + } + } + + Example 2 (message queue): + + void MyQueue::Put(Type *e) { + MutexLock lock(&mu_); + _Py_ANNOTATE_HAPPENS_BEFORE(e); + PutElementIntoMyQueue(e); + } + + Type *MyQueue::Get() { + MutexLock lock(&mu_); + Type *e = GetElementFromMyQueue(); + _Py_ANNOTATE_HAPPENS_AFTER(e); + return e; + } + + Note: when possible, please use the existing reference counting and message + queue implementations instead of inventing new ones. */ + + /* Report that wait on the condition variable at address "cv" has succeeded + and the lock at address "lock" is held. */ +#define _Py_ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) \ + AnnotateCondVarWait(__FILE__, __LINE__, cv, lock) + + /* Report that wait on the condition variable at "cv" has succeeded. Variant + w/o lock. 
*/ +#define _Py_ANNOTATE_CONDVAR_WAIT(cv) \ + AnnotateCondVarWait(__FILE__, __LINE__, cv, NULL) + + /* Report that we are about to signal on the condition variable at address + "cv". */ +#define _Py_ANNOTATE_CONDVAR_SIGNAL(cv) \ + AnnotateCondVarSignal(__FILE__, __LINE__, cv) + + /* Report that we are about to signal_all on the condition variable at "cv". */ +#define _Py_ANNOTATE_CONDVAR_SIGNAL_ALL(cv) \ + AnnotateCondVarSignalAll(__FILE__, __LINE__, cv) + + /* Annotations for user-defined synchronization mechanisms. */ +#define _Py_ANNOTATE_HAPPENS_BEFORE(obj) _Py_ANNOTATE_CONDVAR_SIGNAL(obj) +#define _Py_ANNOTATE_HAPPENS_AFTER(obj) _Py_ANNOTATE_CONDVAR_WAIT(obj) + + /* Report that the bytes in the range [pointer, pointer+size) are about + to be published safely. The race checker will create a happens-before + arc from the call _Py_ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) to + subsequent accesses to this memory. + Note: this annotation may not work properly if the race detector uses + sampling, i.e. does not observe all memory accesses. + */ +#define _Py_ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) \ + AnnotatePublishMemoryRange(__FILE__, __LINE__, pointer, size) + + /* Instruct the tool to create a happens-before arc between mu->Unlock() and + mu->Lock(). This annotation may slow down the race detector and hide real + races. Normally it is used only when it would be difficult to annotate each + of the mutex's critical sections individually using the annotations above. + This annotation makes sense only for hybrid race detectors. For pure + happens-before detectors this is a no-op. For more details see + https://code.google.com/p/data-race-test/wiki/PureHappensBeforeVsHybrid . */ +#define _Py_ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) \ + AnnotateMutexIsUsedAsCondVar(__FILE__, __LINE__, mu) + + /* ------------------------------------------------------------- + Annotations useful when defining memory allocators, or when memory that + was protected in one way starts to be protected in another. */ + + /* Report that a new memory at "address" of size "size" has been allocated. + This might be used when the memory has been retrieved from a free list and + is about to be reused, or when the locking discipline for a variable + changes. */ +#define _Py_ANNOTATE_NEW_MEMORY(address, size) \ + AnnotateNewMemory(__FILE__, __LINE__, address, size) + + /* ------------------------------------------------------------- + Annotations useful when defining FIFO queues that transfer data between + threads. */ + + /* Report that the producer-consumer queue (such as ProducerConsumerQueue) at + address "pcq" has been created. The _Py_ANNOTATE_PCQ_* annotations should + be used only for FIFO queues. For non-FIFO queues use + _Py_ANNOTATE_HAPPENS_BEFORE (for put) and _Py_ANNOTATE_HAPPENS_AFTER (for + get). */ +#define _Py_ANNOTATE_PCQ_CREATE(pcq) \ + AnnotatePCQCreate(__FILE__, __LINE__, pcq) + + /* Report that the queue at address "pcq" is about to be destroyed. */ +#define _Py_ANNOTATE_PCQ_DESTROY(pcq) \ + AnnotatePCQDestroy(__FILE__, __LINE__, pcq) + + /* Report that we are about to put an element into a FIFO queue at address + "pcq". */ +#define _Py_ANNOTATE_PCQ_PUT(pcq) \ + AnnotatePCQPut(__FILE__, __LINE__, pcq) + + /* Report that we've just got an element from a FIFO queue at address "pcq". */ +#define _Py_ANNOTATE_PCQ_GET(pcq) \ + AnnotatePCQGet(__FILE__, __LINE__, pcq) + + /* ------------------------------------------------------------- + Annotations that suppress errors. 
It is usually better to express the + program's synchronization using the other annotations, but these can + be used when all else fails. */ + + /* Report that we may have a benign race at "pointer", with size + "sizeof(*(pointer))". "pointer" must be a non-void* pointer. Insert at the + point where "pointer" has been allocated, preferably close to the point + where the race happens. See also _Py_ANNOTATE_BENIGN_RACE_STATIC. */ +#define _Py_ANNOTATE_BENIGN_RACE(pointer, description) \ + AnnotateBenignRaceSized(__FILE__, __LINE__, pointer, \ + sizeof(*(pointer)), description) + + /* Same as _Py_ANNOTATE_BENIGN_RACE(address, description), but applies to + the memory range [address, address+size). */ +#define _Py_ANNOTATE_BENIGN_RACE_SIZED(address, size, description) \ + AnnotateBenignRaceSized(__FILE__, __LINE__, address, size, description) + + /* Request the analysis tool to ignore all reads in the current thread + until _Py_ANNOTATE_IGNORE_READS_END is called. + Useful to ignore intentional racey reads, while still checking + other reads and all writes. + See also _Py_ANNOTATE_UNPROTECTED_READ. */ +#define _Py_ANNOTATE_IGNORE_READS_BEGIN() \ + AnnotateIgnoreReadsBegin(__FILE__, __LINE__) + + /* Stop ignoring reads. */ +#define _Py_ANNOTATE_IGNORE_READS_END() \ + AnnotateIgnoreReadsEnd(__FILE__, __LINE__) + + /* Similar to _Py_ANNOTATE_IGNORE_READS_BEGIN, but ignore writes. */ +#define _Py_ANNOTATE_IGNORE_WRITES_BEGIN() \ + AnnotateIgnoreWritesBegin(__FILE__, __LINE__) + + /* Stop ignoring writes. */ +#define _Py_ANNOTATE_IGNORE_WRITES_END() \ + AnnotateIgnoreWritesEnd(__FILE__, __LINE__) + + /* Start ignoring all memory accesses (reads and writes). */ +#define _Py_ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() \ + do {\ + _Py_ANNOTATE_IGNORE_READS_BEGIN();\ + _Py_ANNOTATE_IGNORE_WRITES_BEGIN();\ + }while(0)\ + + /* Stop ignoring all memory accesses. */ +#define _Py_ANNOTATE_IGNORE_READS_AND_WRITES_END() \ + do {\ + _Py_ANNOTATE_IGNORE_WRITES_END();\ + _Py_ANNOTATE_IGNORE_READS_END();\ + }while(0)\ + + /* Similar to _Py_ANNOTATE_IGNORE_READS_BEGIN, but ignore synchronization events: + RWLOCK* and CONDVAR*. */ +#define _Py_ANNOTATE_IGNORE_SYNC_BEGIN() \ + AnnotateIgnoreSyncBegin(__FILE__, __LINE__) + + /* Stop ignoring sync events. */ +#define _Py_ANNOTATE_IGNORE_SYNC_END() \ + AnnotateIgnoreSyncEnd(__FILE__, __LINE__) + + + /* Enable (enable!=0) or disable (enable==0) race detection for all threads. + This annotation could be useful if you want to skip expensive race analysis + during some period of program execution, e.g. during initialization. */ +#define _Py_ANNOTATE_ENABLE_RACE_DETECTION(enable) \ + AnnotateEnableRaceDetection(__FILE__, __LINE__, enable) + + /* ------------------------------------------------------------- + Annotations useful for debugging. */ + + /* Request to trace every access to "address". */ +#define _Py_ANNOTATE_TRACE_MEMORY(address) \ + AnnotateTraceMemory(__FILE__, __LINE__, address) + + /* Report the current thread name to a race detector. */ +#define _Py_ANNOTATE_THREAD_NAME(name) \ + AnnotateThreadName(__FILE__, __LINE__, name) + + /* ------------------------------------------------------------- + Annotations useful when implementing locks. They are not + normally needed by modules that merely use locks. + The "lock" argument is a pointer to the lock object. */ + + /* Report that a lock has been created at address "lock". 
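+
+   An illustrative lifecycle for a custom reader/writer lock (a sketch;
+   "my_lock" is a hypothetical lock object, not part of this header):
+
+       _Py_ANNOTATE_RWLOCK_CREATE(&my_lock);
+       _Py_ANNOTATE_RWLOCK_ACQUIRED(&my_lock, 1);   // is_w=1: writer side
+       _Py_ANNOTATE_RWLOCK_RELEASED(&my_lock, 1);
+       _Py_ANNOTATE_RWLOCK_DESTROY(&my_lock);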
*/ +#define _Py_ANNOTATE_RWLOCK_CREATE(lock) \ + AnnotateRWLockCreate(__FILE__, __LINE__, lock) + + /* Report that the lock at address "lock" is about to be destroyed. */ +#define _Py_ANNOTATE_RWLOCK_DESTROY(lock) \ + AnnotateRWLockDestroy(__FILE__, __LINE__, lock) + + /* Report that the lock at address "lock" has been acquired. + is_w=1 for writer lock, is_w=0 for reader lock. */ +#define _Py_ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) \ + AnnotateRWLockAcquired(__FILE__, __LINE__, lock, is_w) + + /* Report that the lock at address "lock" is about to be released. */ +#define _Py_ANNOTATE_RWLOCK_RELEASED(lock, is_w) \ + AnnotateRWLockReleased(__FILE__, __LINE__, lock, is_w) + + /* ------------------------------------------------------------- + Annotations useful when implementing barriers. They are not + normally needed by modules that merely use barriers. + The "barrier" argument is a pointer to the barrier object. */ + + /* Report that the "barrier" has been initialized with initial "count". + If 'reinitialization_allowed' is true, initialization is allowed to happen + multiple times w/o calling barrier_destroy() */ +#define _Py_ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) \ + AnnotateBarrierInit(__FILE__, __LINE__, barrier, count, \ + reinitialization_allowed) + + /* Report that we are about to enter barrier_wait("barrier"). */ +#define _Py_ANNOTATE_BARRIER_WAIT_BEFORE(barrier) \ + AnnotateBarrierWaitBefore(__FILE__, __LINE__, barrier) + + /* Report that we just exited barrier_wait("barrier"). */ +#define _Py_ANNOTATE_BARRIER_WAIT_AFTER(barrier) \ + AnnotateBarrierWaitAfter(__FILE__, __LINE__, barrier) + + /* Report that the "barrier" has been destroyed. */ +#define _Py_ANNOTATE_BARRIER_DESTROY(barrier) \ + AnnotateBarrierDestroy(__FILE__, __LINE__, barrier) + + /* ------------------------------------------------------------- + Annotations useful for testing race detectors. */ + + /* Report that we expect a race on the variable at "address". + Use only in unit tests for a race detector. */ +#define _Py_ANNOTATE_EXPECT_RACE(address, description) \ + AnnotateExpectRace(__FILE__, __LINE__, address, description) + + /* A no-op. Insert where you like to test the interceptors. */ +#define _Py_ANNOTATE_NO_OP(arg) \ + AnnotateNoOp(__FILE__, __LINE__, arg) + + /* Force the race detector to flush its state. The actual effect depends on + * the implementation of the detector. 
*/ +#define _Py_ANNOTATE_FLUSH_STATE() \ + AnnotateFlushState(__FILE__, __LINE__) + + +#else /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */ + +#define _Py_ANNOTATE_RWLOCK_CREATE(lock) /* empty */ +#define _Py_ANNOTATE_RWLOCK_DESTROY(lock) /* empty */ +#define _Py_ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) /* empty */ +#define _Py_ANNOTATE_RWLOCK_RELEASED(lock, is_w) /* empty */ +#define _Py_ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) /* */ +#define _Py_ANNOTATE_BARRIER_WAIT_BEFORE(barrier) /* empty */ +#define _Py_ANNOTATE_BARRIER_WAIT_AFTER(barrier) /* empty */ +#define _Py_ANNOTATE_BARRIER_DESTROY(barrier) /* empty */ +#define _Py_ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) /* empty */ +#define _Py_ANNOTATE_CONDVAR_WAIT(cv) /* empty */ +#define _Py_ANNOTATE_CONDVAR_SIGNAL(cv) /* empty */ +#define _Py_ANNOTATE_CONDVAR_SIGNAL_ALL(cv) /* empty */ +#define _Py_ANNOTATE_HAPPENS_BEFORE(obj) /* empty */ +#define _Py_ANNOTATE_HAPPENS_AFTER(obj) /* empty */ +#define _Py_ANNOTATE_PUBLISH_MEMORY_RANGE(address, size) /* empty */ +#define _Py_ANNOTATE_UNPUBLISH_MEMORY_RANGE(address, size) /* empty */ +#define _Py_ANNOTATE_SWAP_MEMORY_RANGE(address, size) /* empty */ +#define _Py_ANNOTATE_PCQ_CREATE(pcq) /* empty */ +#define _Py_ANNOTATE_PCQ_DESTROY(pcq) /* empty */ +#define _Py_ANNOTATE_PCQ_PUT(pcq) /* empty */ +#define _Py_ANNOTATE_PCQ_GET(pcq) /* empty */ +#define _Py_ANNOTATE_NEW_MEMORY(address, size) /* empty */ +#define _Py_ANNOTATE_EXPECT_RACE(address, description) /* empty */ +#define _Py_ANNOTATE_BENIGN_RACE(address, description) /* empty */ +#define _Py_ANNOTATE_BENIGN_RACE_SIZED(address, size, description) /* empty */ +#define _Py_ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) /* empty */ +#define _Py_ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) /* empty */ +#define _Py_ANNOTATE_TRACE_MEMORY(arg) /* empty */ +#define _Py_ANNOTATE_THREAD_NAME(name) /* empty */ +#define _Py_ANNOTATE_IGNORE_READS_BEGIN() /* empty */ +#define _Py_ANNOTATE_IGNORE_READS_END() /* empty */ +#define _Py_ANNOTATE_IGNORE_WRITES_BEGIN() /* empty */ +#define _Py_ANNOTATE_IGNORE_WRITES_END() /* empty */ +#define _Py_ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() /* empty */ +#define _Py_ANNOTATE_IGNORE_READS_AND_WRITES_END() /* empty */ +#define _Py_ANNOTATE_IGNORE_SYNC_BEGIN() /* empty */ +#define _Py_ANNOTATE_IGNORE_SYNC_END() /* empty */ +#define _Py_ANNOTATE_ENABLE_RACE_DETECTION(enable) /* empty */ +#define _Py_ANNOTATE_NO_OP(arg) /* empty */ +#define _Py_ANNOTATE_FLUSH_STATE() /* empty */ + +#endif /* DYNAMIC_ANNOTATIONS_ENABLED */ + +/* Use the macros above rather than using these functions directly. 
*/ +#ifdef __cplusplus +extern "C" { +#endif +void AnnotateRWLockCreate(const char *file, int line, + const volatile void *lock); +void AnnotateRWLockDestroy(const char *file, int line, + const volatile void *lock); +void AnnotateRWLockAcquired(const char *file, int line, + const volatile void *lock, long is_w); +void AnnotateRWLockReleased(const char *file, int line, + const volatile void *lock, long is_w); +void AnnotateBarrierInit(const char *file, int line, + const volatile void *barrier, long count, + long reinitialization_allowed); +void AnnotateBarrierWaitBefore(const char *file, int line, + const volatile void *barrier); +void AnnotateBarrierWaitAfter(const char *file, int line, + const volatile void *barrier); +void AnnotateBarrierDestroy(const char *file, int line, + const volatile void *barrier); +void AnnotateCondVarWait(const char *file, int line, + const volatile void *cv, + const volatile void *lock); +void AnnotateCondVarSignal(const char *file, int line, + const volatile void *cv); +void AnnotateCondVarSignalAll(const char *file, int line, + const volatile void *cv); +void AnnotatePublishMemoryRange(const char *file, int line, + const volatile void *address, + long size); +void AnnotateUnpublishMemoryRange(const char *file, int line, + const volatile void *address, + long size); +void AnnotatePCQCreate(const char *file, int line, + const volatile void *pcq); +void AnnotatePCQDestroy(const char *file, int line, + const volatile void *pcq); +void AnnotatePCQPut(const char *file, int line, + const volatile void *pcq); +void AnnotatePCQGet(const char *file, int line, + const volatile void *pcq); +void AnnotateNewMemory(const char *file, int line, + const volatile void *address, + long size); +void AnnotateExpectRace(const char *file, int line, + const volatile void *address, + const char *description); +void AnnotateBenignRace(const char *file, int line, + const volatile void *address, + const char *description); +void AnnotateBenignRaceSized(const char *file, int line, + const volatile void *address, + long size, + const char *description); +void AnnotateMutexIsUsedAsCondVar(const char *file, int line, + const volatile void *mu); +void AnnotateTraceMemory(const char *file, int line, + const volatile void *arg); +void AnnotateThreadName(const char *file, int line, + const char *name); +void AnnotateIgnoreReadsBegin(const char *file, int line); +void AnnotateIgnoreReadsEnd(const char *file, int line); +void AnnotateIgnoreWritesBegin(const char *file, int line); +void AnnotateIgnoreWritesEnd(const char *file, int line); +void AnnotateEnableRaceDetection(const char *file, int line, int enable); +void AnnotateNoOp(const char *file, int line, + const volatile void *arg); +void AnnotateFlushState(const char *file, int line); + +/* Return non-zero value if running under valgrind. + + If "valgrind.h" is included into dynamic_annotations.c, + the regular valgrind mechanism will be used. + See http://valgrind.org/docs/manual/manual-core-adv.html about + RUNNING_ON_VALGRIND and other valgrind "client requests". + The file "valgrind.h" may be obtained by doing + svn co svn://svn.valgrind.org/valgrind/trunk/include + + If for some reason you can't use "valgrind.h" or want to fake valgrind, + there are two ways to make this function return non-zero: + - Use environment variable: export RUNNING_ON_VALGRIND=1 + - Make your tool intercept the function RunningOnValgrind() and + change its return value. 
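+
+   A minimal check, as a sketch (illustrative only):
+
+       if (RunningOnValgrind()) {
+           fprintf(stderr, "running under a valgrind-based tool\n");
+       }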
+ */
+int RunningOnValgrind(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#if DYNAMIC_ANNOTATIONS_ENABLED != 0 && defined(__cplusplus)
+
+  /* _Py_ANNOTATE_UNPROTECTED_READ is the preferred way to annotate racey reads.
+
+     Instead of doing
+        _Py_ANNOTATE_IGNORE_READS_BEGIN();
+        ... = x;
+        _Py_ANNOTATE_IGNORE_READS_END();
+     one can use
+        ... = _Py_ANNOTATE_UNPROTECTED_READ(x); */
+  template <typename T>
+  inline T _Py_ANNOTATE_UNPROTECTED_READ(const volatile T &x) {
+    _Py_ANNOTATE_IGNORE_READS_BEGIN();
+    T res = x;
+    _Py_ANNOTATE_IGNORE_READS_END();
+    return res;
+  }
+  /* Apply _Py_ANNOTATE_BENIGN_RACE_SIZED to a static variable. */
+#define _Py_ANNOTATE_BENIGN_RACE_STATIC(static_var, description) \
+  namespace { \
+    class static_var ## _annotator { \
+     public: \
+      static_var ## _annotator() { \
+        _Py_ANNOTATE_BENIGN_RACE_SIZED(&static_var, \
+                                       sizeof(static_var), \
+                                       # static_var ": " description); \
+      } \
+    }; \
+    static static_var ## _annotator the ## static_var ## _annotator;\
+  }
+#else /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */
+
+#define _Py_ANNOTATE_UNPROTECTED_READ(x) (x)
+#define _Py_ANNOTATE_BENIGN_RACE_STATIC(static_var, description) /* empty */
+
+#endif /* DYNAMIC_ANNOTATIONS_ENABLED */
+
+#endif /* __DYNAMIC_ANNOTATIONS_H__ */
diff --git a/Include/enumobject.h b/Include/enumobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..c14dbfc8c37e7c9316b9cca0a5969ee925d729d8
--- /dev/null
+++ b/Include/enumobject.h
@@ -0,0 +1,17 @@
+#ifndef Py_ENUMOBJECT_H
+#define Py_ENUMOBJECT_H
+
+/* Enumerate Object */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_DATA(PyTypeObject) PyEnum_Type;
+PyAPI_DATA(PyTypeObject) PyReversed_Type;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !Py_ENUMOBJECT_H */
diff --git a/Include/errcode.h b/Include/errcode.h
new file mode 100644
index 0000000000000000000000000000000000000000..dac5cf068c99d6d4ab1b678c48da3e6f1e2660ee
--- /dev/null
+++ b/Include/errcode.h
@@ -0,0 +1,45 @@
+// Error codes passed around between file input, tokenizer, parser and
+// interpreter. This is necessary so we can turn them into Python
+// exceptions at a higher level. Note that some errors have a
+// slightly different meaning when passed from the tokenizer to the
+// parser than when passed from the parser to the interpreter; e.g.
+// the parser only returns E_EOF when it hits EOF immediately, and it
+// never returns E_OK.
+// +// The public PyRun_InteractiveOneObjectEx() function can return E_EOF, +// same as its variants: +// +// * PyRun_InteractiveOneObject() +// * PyRun_InteractiveOneFlags() +// * PyRun_InteractiveOne() + +#ifndef Py_ERRCODE_H +#define Py_ERRCODE_H +#ifdef __cplusplus +extern "C" { +#endif + +#define E_OK 10 /* No error */ +#define E_EOF 11 /* End Of File */ +#define E_INTR 12 /* Interrupted */ +#define E_TOKEN 13 /* Bad token */ +#define E_SYNTAX 14 /* Syntax error */ +#define E_NOMEM 15 /* Ran out of memory */ +#define E_DONE 16 /* Parsing complete */ +#define E_ERROR 17 /* Execution error */ +#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */ +#define E_OVERFLOW 19 /* Node had too many children */ +#define E_TOODEEP 20 /* Too many indentation levels */ +#define E_DEDENT 21 /* No matching outer block for dedent */ +#define E_DECODE 22 /* Error in decoding into Unicode */ +#define E_EOFS 23 /* EOF in triple-quoted string */ +#define E_EOLS 24 /* EOL in single-quoted string */ +#define E_LINECONT 25 /* Unexpected characters after a line continuation */ +#define E_BADSINGLE 27 /* Ill-formed single statement input */ +#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */ +#define E_COLUMNOVERFLOW 29 /* Column offset overflow */ + +#ifdef __cplusplus +} +#endif +#endif /* !Py_ERRCODE_H */ diff --git a/Include/exports.h b/Include/exports.h new file mode 100644 index 0000000000000000000000000000000000000000..ce601216f171563df33cec3e2ed5cbe42aad5a00 --- /dev/null +++ b/Include/exports.h @@ -0,0 +1,108 @@ +#ifndef Py_EXPORTS_H +#define Py_EXPORTS_H + +/* Declarations for symbol visibility. + + PyAPI_FUNC(type): Declares a public Python API function and return type + PyAPI_DATA(type): Declares public Python data and its type + PyMODINIT_FUNC: A Python module init function. If these functions are + inside the Python core, they are private to the core. + If in an extension module, it may be declared with + external linkage depending on the platform. + + As a number of platforms support/require "__declspec(dllimport/dllexport)", + we support a HAVE_DECLSPEC_DLL macro to save duplication. +*/ + +/* + All windows ports, except cygwin, are handled in PC/pyconfig.h. + + Cygwin is the only other autoconf platform requiring special + linkage handling and it uses __declspec(). +*/ +#if defined(__CYGWIN__) +# define HAVE_DECLSPEC_DLL +#endif + +#if defined(_WIN32) || defined(__CYGWIN__) + #if defined(Py_ENABLE_SHARED) + #define Py_IMPORTED_SYMBOL __declspec(dllimport) + #define Py_EXPORTED_SYMBOL __declspec(dllexport) + #define Py_LOCAL_SYMBOL + #else + #define Py_IMPORTED_SYMBOL + #define Py_EXPORTED_SYMBOL + #define Py_LOCAL_SYMBOL + #endif +#else +/* + * If we only ever used gcc >= 5, we could use __has_attribute(visibility) + * as a cross-platform way to determine if visibility is supported. However, + * we may still need to support gcc >= 4, as some Ubuntu LTS and Centos versions + * have 4 < gcc < 5. + */ + #ifndef __has_attribute + #define __has_attribute(x) 0 // Compatibility with non-clang compilers. 
+ #endif + #if (defined(__GNUC__) && (__GNUC__ >= 4)) ||\ + (defined(__clang__) && __has_attribute(visibility)) + #define Py_IMPORTED_SYMBOL __attribute__ ((visibility ("default"))) + #define Py_EXPORTED_SYMBOL __attribute__ ((visibility ("default"))) + #define Py_LOCAL_SYMBOL __attribute__ ((visibility ("hidden"))) + #else + #define Py_IMPORTED_SYMBOL + #define Py_EXPORTED_SYMBOL + #define Py_LOCAL_SYMBOL + #endif +#endif + +/* only get special linkage if built as shared or platform is Cygwin */ +#if defined(Py_ENABLE_SHARED) || defined(__CYGWIN__) +# if defined(HAVE_DECLSPEC_DLL) +# if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) +# define PyAPI_FUNC(RTYPE) Py_EXPORTED_SYMBOL RTYPE +# define PyAPI_DATA(RTYPE) extern Py_EXPORTED_SYMBOL RTYPE + /* module init functions inside the core need no external linkage */ + /* except for Cygwin to handle embedding */ +# if defined(__CYGWIN__) +# define PyMODINIT_FUNC Py_EXPORTED_SYMBOL PyObject* +# else /* __CYGWIN__ */ +# define PyMODINIT_FUNC PyObject* +# endif /* __CYGWIN__ */ +# else /* Py_BUILD_CORE */ + /* Building an extension module, or an embedded situation */ + /* public Python functions and data are imported */ + /* Under Cygwin, auto-import functions to prevent compilation */ + /* failures similar to those described at the bottom of 4.1: */ + /* http://docs.python.org/extending/windows.html#a-cookbook-approach */ +# if !defined(__CYGWIN__) +# define PyAPI_FUNC(RTYPE) Py_IMPORTED_SYMBOL RTYPE +# endif /* !__CYGWIN__ */ +# define PyAPI_DATA(RTYPE) extern Py_IMPORTED_SYMBOL RTYPE + /* module init functions outside the core must be exported */ +# if defined(__cplusplus) +# define PyMODINIT_FUNC extern "C" Py_EXPORTED_SYMBOL PyObject* +# else /* __cplusplus */ +# define PyMODINIT_FUNC Py_EXPORTED_SYMBOL PyObject* +# endif /* __cplusplus */ +# endif /* Py_BUILD_CORE */ +# endif /* HAVE_DECLSPEC_DLL */ +#endif /* Py_ENABLE_SHARED */ + +/* If no external linkage macros defined by now, create defaults */ +#ifndef PyAPI_FUNC +# define PyAPI_FUNC(RTYPE) Py_EXPORTED_SYMBOL RTYPE +#endif +#ifndef PyAPI_DATA +# define PyAPI_DATA(RTYPE) extern Py_EXPORTED_SYMBOL RTYPE +#endif +#ifndef PyMODINIT_FUNC +# if defined(__cplusplus) +# define PyMODINIT_FUNC extern "C" Py_EXPORTED_SYMBOL PyObject* +# else /* __cplusplus */ +# define PyMODINIT_FUNC Py_EXPORTED_SYMBOL PyObject* +# endif /* __cplusplus */ +#endif + + +#endif /* Py_EXPORTS_H */ diff --git a/Include/fileobject.h b/Include/fileobject.h new file mode 100644 index 0000000000000000000000000000000000000000..6a6d11409497fab0fac0b20851af8786fd09545a --- /dev/null +++ b/Include/fileobject.h @@ -0,0 +1,41 @@ +/* File object interface (what's left of it -- see io.py) */ + +#ifndef Py_FILEOBJECT_H +#define Py_FILEOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +#define PY_STDIOTEXTMODE "b" + +PyAPI_FUNC(PyObject *) PyFile_FromFd(int, const char *, const char *, int, + const char *, const char *, + const char *, int); +PyAPI_FUNC(PyObject *) PyFile_GetLine(PyObject *, int); +PyAPI_FUNC(int) PyFile_WriteObject(PyObject *, PyObject *, int); +PyAPI_FUNC(int) PyFile_WriteString(const char *, PyObject *); +PyAPI_FUNC(int) PyObject_AsFileDescriptor(PyObject *); + +/* The default encoding used by the platform file system APIs + If non-NULL, this is different than the default encoding for strings +*/ +Py_DEPRECATED(3.12) PyAPI_DATA(const char *) Py_FileSystemDefaultEncoding; +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03060000 +Py_DEPRECATED(3.12) PyAPI_DATA(const char *) 
Py_FileSystemDefaultEncodeErrors;
+#endif
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_HasFileSystemDefaultEncoding;
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_UTF8Mode;
+#endif
+
+#ifndef Py_LIMITED_API
+# define Py_CPYTHON_FILEOBJECT_H
+# include "cpython/fileobject.h"
+# undef Py_CPYTHON_FILEOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_FILEOBJECT_H */
diff --git a/Include/fileutils.h b/Include/fileutils.h
new file mode 100644
index 0000000000000000000000000000000000000000..1509198e45f0cae4ae81a883890ea542c275cf81
--- /dev/null
+++ b/Include/fileutils.h
@@ -0,0 +1,62 @@
+#ifndef Py_FILEUTILS_H
+#define Py_FILEUTILS_H
+
+/*******************************
+ * stat() and fstat() fiddling *
+ *******************************/
+
+#ifdef HAVE_SYS_STAT_H
+# include <sys/stat.h>    // S_ISREG()
+#elif defined(HAVE_STAT_H)
+# include <stat.h>        // S_ISREG()
+#endif
+
+#ifndef S_IFMT
+   // VisualAge C/C++ Failed to Define MountType Field in sys/stat.h.
+# define S_IFMT 0170000
+#endif
+#ifndef S_IFLNK
+   // Windows doesn't define S_IFLNK, but posixmodule.c maps
+   // IO_REPARSE_TAG_SYMLINK to S_IFLNK.
+# define S_IFLNK 0120000
+#endif
+#ifndef S_ISREG
+# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
+#endif
+#ifndef S_ISDIR
+# define S_ISDIR(x) (((x) & S_IFMT) == S_IFDIR)
+#endif
+#ifndef S_ISCHR
+# define S_ISCHR(x) (((x) & S_IFMT) == S_IFCHR)
+#endif
+#ifndef S_ISLNK
+# define S_ISLNK(x) (((x) & S_IFMT) == S_IFLNK)
+#endif
+
+
+// Move this down here since some C++ #include's don't like to be included
+// inside an extern "C".
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+PyAPI_FUNC(wchar_t *) Py_DecodeLocale(
+    const char *arg,
+    size_t *size);
+
+PyAPI_FUNC(char*) Py_EncodeLocale(
+    const wchar_t *text,
+    size_t *error_pos);
+#endif
+
+#ifndef Py_LIMITED_API
+# define Py_CPYTHON_FILEUTILS_H
+# include "cpython/fileutils.h"
+# undef Py_CPYTHON_FILEUTILS_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_FILEUTILS_H */
diff --git a/Include/floatobject.h b/Include/floatobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..8963c16832a4bc6f7f38b9db2327e76ba44464e5
--- /dev/null
+++ b/Include/floatobject.h
@@ -0,0 +1,54 @@
+
+/* Float object interface */
+
+/*
+PyFloatObject represents a (double precision) floating-point number.
+*/
+
+#ifndef Py_FLOATOBJECT_H
+#define Py_FLOATOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_DATA(PyTypeObject) PyFloat_Type;
+
+#define PyFloat_Check(op) PyObject_TypeCheck(op, &PyFloat_Type)
+#define PyFloat_CheckExact(op) Py_IS_TYPE((op), &PyFloat_Type)
+
+#define Py_RETURN_NAN return PyFloat_FromDouble(Py_NAN)
+
+#define Py_RETURN_INF(sign)                          \
+    do {                                             \
+        if (copysign(1., sign) == 1.) {              \
+            return PyFloat_FromDouble(Py_HUGE_VAL);  \
+        }                                            \
+        else {                                       \
+            return PyFloat_FromDouble(-Py_HUGE_VAL); \
+        }                                            \
+    } while(0)
+
+PyAPI_FUNC(double) PyFloat_GetMax(void);
+PyAPI_FUNC(double) PyFloat_GetMin(void);
+PyAPI_FUNC(PyObject*) PyFloat_GetInfo(void);
+
+/* Return Python float from string PyObject. */
+PyAPI_FUNC(PyObject*) PyFloat_FromString(PyObject*);
+
+/* Return Python float from C double. */
+PyAPI_FUNC(PyObject*) PyFloat_FromDouble(double);
+
+/* Extract C double from Python float. The macro version trades safety for
+   speed.
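+
+   A minimal error-checking sketch (illustrative; "obj" is a hypothetical
+   PyObject pointer): PyFloat_AsDouble() returns -1.0 with an exception
+   set on failure, so a -1.0 result must be disambiguated.
+
+       double x = PyFloat_AsDouble(obj);
+       if (x == -1.0 && PyErr_Occurred()) {
+           return NULL;   // conversion failed
+       }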
*/ +PyAPI_FUNC(double) PyFloat_AsDouble(PyObject*); + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_FLOATOBJECT_H +# include "cpython/floatobject.h" +# undef Py_CPYTHON_FLOATOBJECT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_FLOATOBJECT_H */ diff --git a/Include/frameobject.h b/Include/frameobject.h new file mode 100644 index 0000000000000000000000000000000000000000..adb628f6314fcfc6bff80b4bb09483158354b91f --- /dev/null +++ b/Include/frameobject.h @@ -0,0 +1,20 @@ +/* Frame object interface */ + +#ifndef Py_FRAMEOBJECT_H +#define Py_FRAMEOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +#include "pyframe.h" + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_FRAMEOBJECT_H +# include "cpython/frameobject.h" +# undef Py_CPYTHON_FRAMEOBJECT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_FRAMEOBJECT_H */ diff --git a/Include/genericaliasobject.h b/Include/genericaliasobject.h new file mode 100644 index 0000000000000000000000000000000000000000..cf002976b27cd7712ee80bf348f55c5c562bc12d --- /dev/null +++ b/Include/genericaliasobject.h @@ -0,0 +1,14 @@ +// Implementation of PEP 585: support list[int] etc. +#ifndef Py_GENERICALIASOBJECT_H +#define Py_GENERICALIASOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_FUNC(PyObject *) Py_GenericAlias(PyObject *, PyObject *); +PyAPI_DATA(PyTypeObject) Py_GenericAliasType; + +#ifdef __cplusplus +} +#endif +#endif /* !Py_GENERICALIASOBJECT_H */ diff --git a/Include/import.h b/Include/import.h new file mode 100644 index 0000000000000000000000000000000000000000..24b23b9119196fcb5908ae70e035a884543bcc05 --- /dev/null +++ b/Include/import.h @@ -0,0 +1,103 @@ +/* Module definition and import interface */ + +#ifndef Py_IMPORT_H +#define Py_IMPORT_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_FUNC(long) PyImport_GetMagicNumber(void); +PyAPI_FUNC(const char *) PyImport_GetMagicTag(void); +PyAPI_FUNC(PyObject *) PyImport_ExecCodeModule( + const char *name, /* UTF-8 encoded string */ + PyObject *co + ); +PyAPI_FUNC(PyObject *) PyImport_ExecCodeModuleEx( + const char *name, /* UTF-8 encoded string */ + PyObject *co, + const char *pathname /* decoded from the filesystem encoding */ + ); +PyAPI_FUNC(PyObject *) PyImport_ExecCodeModuleWithPathnames( + const char *name, /* UTF-8 encoded string */ + PyObject *co, + const char *pathname, /* decoded from the filesystem encoding */ + const char *cpathname /* decoded from the filesystem encoding */ + ); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(PyObject *) PyImport_ExecCodeModuleObject( + PyObject *name, + PyObject *co, + PyObject *pathname, + PyObject *cpathname + ); +#endif +PyAPI_FUNC(PyObject *) PyImport_GetModuleDict(void); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000 +PyAPI_FUNC(PyObject *) PyImport_GetModule(PyObject *name); +#endif +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(PyObject *) PyImport_AddModuleObject( + PyObject *name + ); +#endif +PyAPI_FUNC(PyObject *) PyImport_AddModule( + const char *name /* UTF-8 encoded string */ + ); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030d0000 +PyAPI_FUNC(PyObject *) PyImport_AddModuleRef( + const char *name /* UTF-8 encoded string */ + ); +#endif +PyAPI_FUNC(PyObject *) PyImport_ImportModule( + const char *name /* UTF-8 encoded string */ + ); +Py_DEPRECATED(3.13) PyAPI_FUNC(PyObject *) PyImport_ImportModuleNoBlock( + const char *name /* UTF-8 encoded string */ + ); +PyAPI_FUNC(PyObject *) PyImport_ImportModuleLevel( + const char *name, /* 
UTF-8 encoded string */ + PyObject *globals, + PyObject *locals, + PyObject *fromlist, + int level + ); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 +PyAPI_FUNC(PyObject *) PyImport_ImportModuleLevelObject( + PyObject *name, + PyObject *globals, + PyObject *locals, + PyObject *fromlist, + int level + ); +#endif + +#define PyImport_ImportModuleEx(n, g, l, f) \ + PyImport_ImportModuleLevel((n), (g), (l), (f), 0) + +PyAPI_FUNC(PyObject *) PyImport_GetImporter(PyObject *path); +PyAPI_FUNC(PyObject *) PyImport_Import(PyObject *name); +PyAPI_FUNC(PyObject *) PyImport_ReloadModule(PyObject *m); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(int) PyImport_ImportFrozenModuleObject( + PyObject *name + ); +#endif +PyAPI_FUNC(int) PyImport_ImportFrozenModule( + const char *name /* UTF-8 encoded string */ + ); + +PyAPI_FUNC(int) PyImport_AppendInittab( + const char *name, /* ASCII encoded string */ + PyObject* (*initfunc)(void) + ); + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_IMPORT_H +# include "cpython/import.h" +# undef Py_CPYTHON_IMPORT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_IMPORT_H */ diff --git a/Include/internal/mimalloc/mimalloc.h b/Include/internal/mimalloc/mimalloc.h new file mode 100644 index 0000000000000000000000000000000000000000..821129e7690b1b267dfe802a8c8a8b913f7299ea --- /dev/null +++ b/Include/internal/mimalloc/mimalloc.h @@ -0,0 +1,565 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_H +#define MIMALLOC_H + +#define MI_MALLOC_VERSION 212 // major + 2 digits minor + +// ------------------------------------------------------ +// Compiler specific attributes +// ------------------------------------------------------ + +#ifdef __cplusplus + #if (__cplusplus >= 201103L) || (_MSC_VER > 1900) // C++11 + #define mi_attr_noexcept noexcept + #else + #define mi_attr_noexcept throw() + #endif +#else + #define mi_attr_noexcept +#endif + +#if defined(__cplusplus) && (__cplusplus >= 201703) + #define mi_decl_nodiscard [[nodiscard]] +#elif (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__) // includes clang, icc, and clang-cl + #define mi_decl_nodiscard __attribute__((warn_unused_result)) +#elif defined(_HAS_NODISCARD) + #define mi_decl_nodiscard _NODISCARD +#elif (_MSC_VER >= 1700) + #define mi_decl_nodiscard _Check_return_ +#else + #define mi_decl_nodiscard +#endif + +#if defined(_MSC_VER) || defined(__MINGW32__) + #if !defined(MI_SHARED_LIB) + #define mi_decl_export + #elif defined(MI_SHARED_LIB_EXPORT) + #define mi_decl_export __declspec(dllexport) + #else + #define mi_decl_export __declspec(dllimport) + #endif + #if defined(__MINGW32__) + #define mi_decl_restrict + #define mi_attr_malloc __attribute__((malloc)) + #else + #if (_MSC_VER >= 1900) && !defined(__EDG__) + #define mi_decl_restrict __declspec(allocator) __declspec(restrict) + #else + #define mi_decl_restrict __declspec(restrict) + #endif + #define mi_attr_malloc + #endif + #define mi_cdecl __cdecl + #define mi_attr_alloc_size(s) + #define mi_attr_alloc_size2(s1,s2) + #define mi_attr_alloc_align(p) +#elif defined(__GNUC__) // includes clang and icc + #if 
defined(MI_SHARED_LIB) && defined(MI_SHARED_LIB_EXPORT)
+    #define mi_decl_export              __attribute__((visibility("default")))
+  #else
+    #define mi_decl_export
+  #endif
+  #define mi_cdecl                      // leads to warnings... __attribute__((cdecl))
+  #define mi_decl_restrict
+  #define mi_attr_malloc                __attribute__((malloc))
+  #if (defined(__clang_major__) && (__clang_major__ < 4)) || (__GNUC__ < 5)
+    #define mi_attr_alloc_size(s)
+    #define mi_attr_alloc_size2(s1,s2)
+    #define mi_attr_alloc_align(p)
+  #elif defined(__INTEL_COMPILER)
+    #define mi_attr_alloc_size(s)       __attribute__((alloc_size(s)))
+    #define mi_attr_alloc_size2(s1,s2)  __attribute__((alloc_size(s1,s2)))
+    #define mi_attr_alloc_align(p)
+  #else
+    #define mi_attr_alloc_size(s)       __attribute__((alloc_size(s)))
+    #define mi_attr_alloc_size2(s1,s2)  __attribute__((alloc_size(s1,s2)))
+    #define mi_attr_alloc_align(p)      __attribute__((alloc_align(p)))
+  #endif
+#else
+  #define mi_cdecl
+  #define mi_decl_export
+  #define mi_decl_restrict
+  #define mi_attr_malloc
+  #define mi_attr_alloc_size(s)
+  #define mi_attr_alloc_size2(s1,s2)
+  #define mi_attr_alloc_align(p)
+#endif
+
+// ------------------------------------------------------
+// Includes
+// ------------------------------------------------------
+
+#include <stddef.h>     // size_t
+#include <stdbool.h>    // bool
+#include <stdint.h>     // INTPTR_MAX
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ------------------------------------------------------
+// Standard malloc interface
+// ------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
+mi_decl_nodiscard mi_decl_export void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_export void* mi_expand(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2);
+
+mi_decl_export void mi_free(void* p) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc;
+
+// ------------------------------------------------------
+// Extended functionality
+// ------------------------------------------------------
+#define MI_SMALL_WSIZE_MAX  (128)
+#define MI_SMALL_SIZE_MAX   (MI_SMALL_WSIZE_MAX*sizeof(void*))
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
+mi_decl_nodiscard mi_decl_export void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3);
+mi_decl_nodiscard mi_decl_export void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2);
+
+mi_decl_nodiscard mi_decl_export
size_t mi_usable_size(const void* p) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept; + + +// ------------------------------------------------------ +// Internals +// ------------------------------------------------------ + +typedef void (mi_cdecl mi_deferred_free_fun)(bool force, unsigned long long heartbeat, void* arg); +mi_decl_export void mi_register_deferred_free(mi_deferred_free_fun* deferred_free, void* arg) mi_attr_noexcept; + +typedef void (mi_cdecl mi_output_fun)(const char* msg, void* arg); +mi_decl_export void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept; + +typedef void (mi_cdecl mi_error_fun)(int err, void* arg); +mi_decl_export void mi_register_error(mi_error_fun* fun, void* arg); + +mi_decl_export void mi_collect(bool force) mi_attr_noexcept; +mi_decl_export int mi_version(void) mi_attr_noexcept; +mi_decl_export void mi_stats_reset(void) mi_attr_noexcept; +mi_decl_export void mi_stats_merge(void) mi_attr_noexcept; +mi_decl_export void mi_stats_print(void* out) mi_attr_noexcept; // backward compatibility: `out` is ignored and should be NULL +mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; + +mi_decl_export void mi_process_init(void) mi_attr_noexcept; +mi_decl_export void mi_thread_init(void) mi_attr_noexcept; +mi_decl_export void mi_thread_done(void) mi_attr_noexcept; +mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; + +mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, + size_t* current_rss, size_t* peak_rss, + size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept; + +// ------------------------------------------------------------------------------------- +// Aligned allocation +// Note that `alignment` always follows `size` for consistency with unaligned +// allocation, but unfortunately this differs from `posix_memalign` and `aligned_alloc`. 
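+//
+// For example (an illustrative sketch):
+//
+//   void* p = mi_malloc_aligned(1024, 64);   // 1 KiB block, 64-byte aligned
+//   mi_free(p);
+//
+// Note the argument order: C11 aligned_alloc(64, 1024) would take the
+// alignment first.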
+// ------------------------------------------------------------------------------------- + +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); +mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2); + + +// ------------------------------------------------------------------------------------- +// Heaps: first-class, but can only allocate from the same thread that created it. +// ------------------------------------------------------------------------------------- + +struct mi_heap_s; +typedef struct mi_heap_s mi_heap_t; + +mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new(void); +mi_decl_export void mi_heap_delete(mi_heap_t* heap); +mi_decl_export void mi_heap_destroy(mi_heap_t* heap); +mi_decl_export mi_heap_t* mi_heap_set_default(mi_heap_t* heap); +mi_decl_export mi_heap_t* mi_heap_get_default(void); +mi_decl_export mi_heap_t* mi_heap_get_backing(void); +mi_decl_export void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept; + +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); + +mi_decl_nodiscard mi_decl_export void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3); +mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4); +mi_decl_nodiscard mi_decl_export void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept 
mi_attr_alloc_size(3); + +mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept mi_attr_malloc; +mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept mi_attr_malloc; +mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc; + +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3) mi_attr_alloc_align(4); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); +mi_decl_nodiscard mi_decl_export void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4); +mi_decl_nodiscard mi_decl_export void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3); + + +// -------------------------------------------------------------------------------- +// Zero initialized re-allocation. +// Only valid on memory that was originally allocated with zero initialization too. +// e.g. `mi_calloc`, `mi_zalloc`, `mi_zalloc_aligned` etc. 
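+// An illustrative sketch:
+//
+//   int* p = (int*)mi_zalloc(100 * sizeof(int));   // zero-initialized
+//   p = (int*)mi_rezalloc(p, 200 * sizeof(int));   // old data kept, new tail is zero
+//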
+// --------------------------------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_export void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export void* mi_recalloc(void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3);
+
+mi_decl_nodiscard mi_decl_export void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(2,3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(2,3);
+
+mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4);
+
+mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(3,4) mi_attr_alloc_align(5);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(3,4);
+
+
+// ------------------------------------------------------
+// Analysis
+// ------------------------------------------------------
+
+mi_decl_export bool mi_heap_contains_block(mi_heap_t* heap, const void* p);
+mi_decl_export bool mi_heap_check_owned(mi_heap_t* heap, const void* p);
+mi_decl_export bool mi_check_owned(const void* p);
+
+// An area of heap space contains blocks of a single size.
+typedef struct mi_heap_area_s {
+  void*  blocks;          // start of the area containing heap blocks
+  size_t reserved;        // bytes reserved for this area (virtual)
+  size_t committed;       // current available bytes for this area
+  size_t used;            // number of allocated blocks
+  size_t block_size;      // size in bytes of each block
+  size_t full_block_size; // size in bytes of a full block including padding and metadata.
+} mi_heap_area_t;
+
+typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg);
+
+mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block_visit_fun* visitor, void* arg);
+
+// Experimental
+mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept;
+
+mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept;
+mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept;
+
+mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept;
+mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept;
+
+mi_decl_export void mi_debug_show_arenas(void) mi_attr_noexcept;
+
+// Experimental: heaps associated with specific memory arenas
+typedef int mi_arena_id_t;
+mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size);
+mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
+mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
+mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
+
+#if MI_MALLOC_VERSION >= 182
+// Create a heap that only allocates in the specified arena
+mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id);
+#endif
+
+// deprecated
+mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
+
+
+// ------------------------------------------------------
+// Convenience
+// ------------------------------------------------------
+
+#define mi_malloc_tp(tp)               ((tp*)mi_malloc(sizeof(tp)))
+#define mi_zalloc_tp(tp)               ((tp*)mi_zalloc(sizeof(tp)))
+#define mi_calloc_tp(tp,n)             ((tp*)mi_calloc(n,sizeof(tp)))
+#define mi_mallocn_tp(tp,n)            ((tp*)mi_mallocn(n,sizeof(tp)))
+#define mi_reallocn_tp(p,tp,n)         ((tp*)mi_reallocn(p,n,sizeof(tp)))
+#define mi_recalloc_tp(p,tp,n)         ((tp*)mi_recalloc(p,n,sizeof(tp)))
+
+#define mi_heap_malloc_tp(hp,tp)       ((tp*)mi_heap_malloc(hp,sizeof(tp)))
+#define mi_heap_zalloc_tp(hp,tp)       ((tp*)mi_heap_zalloc(hp,sizeof(tp)))
+#define mi_heap_calloc_tp(hp,tp,n)     ((tp*)mi_heap_calloc(hp,n,sizeof(tp)))
+#define mi_heap_mallocn_tp(hp,tp,n)    ((tp*)mi_heap_mallocn(hp,n,sizeof(tp)))
+#define mi_heap_reallocn_tp(hp,p,tp,n) ((tp*)mi_heap_reallocn(hp,p,n,sizeof(tp)))
+#define mi_heap_recalloc_tp(hp,p,tp,n) ((tp*)mi_heap_recalloc(hp,p,n,sizeof(tp)))
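+
+// For example, a short usage sketch of the typed convenience macros
+// (`point_t` is illustrative only, not part of the upstream header):
+//
+//   typedef struct point_s { int x, y; } point_t;
+//   point_t* p  = mi_malloc_tp(point_t);        // (point_t*)mi_malloc(sizeof(point_t))
+//   point_t* ps = mi_calloc_tp(point_t, 8);     // 8 zero-initialized points
+//   ps = mi_reallocn_tp(ps, point_t, 16);       // grow to 16 points
+//   mi_free(p); mi_free(ps);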
+// ------------------------------------------------------
+// Options
+// ------------------------------------------------------
+
+typedef enum mi_option_e {
+  // stable options
+  mi_option_show_errors,              // print error messages
+  mi_option_show_stats,               // print statistics on termination
+  mi_option_verbose,                  // print verbose messages
+  // the following options are experimental (see src/options.h)
+  mi_option_eager_commit,             // eager commit segments? (after `eager_commit_delay` segments) (=1)
+  mi_option_arena_eager_commit,       // eager commit arenas? Use 2 to enable just on overcommit systems (=2)
+  mi_option_purge_decommits,          // should a memory purge decommit (or only reset) (=1)
+  mi_option_allow_large_os_pages,     // allow large (2MiB) OS pages, implies eager commit
+  mi_option_reserve_huge_os_pages,    // reserve N huge OS pages (1GiB/page) at startup
+  mi_option_reserve_huge_os_pages_at, // reserve huge OS pages at a specific NUMA node
+  mi_option_reserve_os_memory,        // reserve specified amount of OS memory in an arena at startup
+  mi_option_deprecated_segment_cache,
+  mi_option_deprecated_page_reset,
+  mi_option_abandoned_page_purge,     // immediately purge delayed purges on thread termination
+  mi_option_deprecated_segment_reset,
+  mi_option_eager_commit_delay,
+  mi_option_purge_delay,              // memory purging is delayed by N milliseconds; use 0 for immediate purging or -1 for no purging at all.
+  mi_option_use_numa_nodes,           // 0 = use all available numa nodes, otherwise use at most N nodes.
+  mi_option_limit_os_alloc,           // 1 = do not use OS memory for allocation (but only programmatically reserved arenas)
+  mi_option_os_tag,                   // tag used for OS logging (macOS only for now)
+  mi_option_max_errors,               // issue at most N error messages
+  mi_option_max_warnings,             // issue at most N warning messages
+  mi_option_max_segment_reclaim,
+  mi_option_destroy_on_exit,          // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe.
+  mi_option_arena_reserve,            // initial memory size in KiB for arena reservation (1GiB on 64-bit)
+  mi_option_arena_purge_mult,
+  mi_option_purge_extend_delay,
+  _mi_option_last,
+  // legacy option names
+  mi_option_large_os_pages = mi_option_allow_large_os_pages,
+  mi_option_eager_region_commit = mi_option_arena_eager_commit,
+  mi_option_reset_decommits = mi_option_purge_decommits,
+  mi_option_reset_delay = mi_option_purge_delay,
+  mi_option_abandoned_page_reset = mi_option_abandoned_page_purge
+} mi_option_t;
+
+
+mi_decl_nodiscard mi_decl_export bool mi_option_is_enabled(mi_option_t option);
+mi_decl_export void mi_option_enable(mi_option_t option);
+mi_decl_export void mi_option_disable(mi_option_t option);
+mi_decl_export void mi_option_set_enabled(mi_option_t option, bool enable);
+mi_decl_export void mi_option_set_enabled_default(mi_option_t option, bool enable);
+
+mi_decl_nodiscard mi_decl_export long mi_option_get(mi_option_t option);
+mi_decl_nodiscard mi_decl_export long mi_option_get_clamp(mi_option_t option, long min, long max);
+mi_decl_nodiscard mi_decl_export size_t mi_option_get_size(mi_option_t option);
+mi_decl_export void mi_option_set(mi_option_t option, long value);
+mi_decl_export void mi_option_set_default(mi_option_t option, long value);
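+
+// For example, an illustrative sketch (not part of the upstream header): options
+// can be tuned programmatically before first use; the same options can also be
+// set through environment variables mirroring the option names (e.g.
+// MIMALLOC_SHOW_STATS, MIMALLOC_PURGE_DELAY).
+//
+//   mi_option_enable(mi_option_show_stats);    // print statistics on termination
+//   mi_option_set(mi_option_purge_delay, 0);   // purge freed OS memory immediately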
+// -------------------------------------------------------------------------------------------------------
+// "mi" prefixed implementations of various posix, Unix, Windows, and C++ allocation functions.
+// (This can be convenient when providing overrides of these functions as done in `mimalloc-override.h`.)
+// note: we use `mi_cfree` as "checked free" and it checks if the pointer is in our heap before free-ing.
+// -------------------------------------------------------------------------------------------------------
+
+mi_decl_export void mi_cfree(void* p) mi_attr_noexcept;
+mi_decl_export void* mi__expand(void* p, size_t newsize) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_malloc_size(const void* p) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_malloc_good_size(size_t size) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept;
+
+mi_decl_export int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1);
+
+mi_decl_nodiscard mi_decl_export void* mi_reallocarray(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3);
+mi_decl_nodiscard mi_decl_export int mi_reallocarr(void* p, size_t count, size_t size) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept;
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept mi_attr_malloc;
+mi_decl_export int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept;
+mi_decl_export int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept;
+
+mi_decl_export void mi_free_size(void* p, size_t size) mi_attr_noexcept;
+mi_decl_export void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept;
+mi_decl_export void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept;
+
+// The `mi_new` wrappers implement C++ semantics on out-of-memory instead of directly returning `NULL`.
+// (and call `std::get_new_handler` and potentially raise a `std::bad_alloc` exception).
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new(size_t size) mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_n(size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(1, 2);
+mi_decl_nodiscard mi_decl_export void* mi_new_realloc(void* p, size_t newsize) mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_attr_alloc_size2(2, 3);
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(2, 3);
+
+#ifdef __cplusplus
+}
+#endif
+
+// ---------------------------------------------------------------------------------------------
+// Implement the C++ std::allocator interface for use in STL containers.
+// (note: see `mimalloc-new-delete.h` for overriding the new/delete operators globally)
+// ---------------------------------------------------------------------------------------------
+#ifdef __cplusplus
+
+#include <cstddef>     // std::size_t
+#include <cstdint>     // PTRDIFF_MAX
+#if (__cplusplus >= 201103L) || (_MSC_VER > 1900)  // C++11
+#include <type_traits> // std::true_type
+#include <utility>     // std::forward
+#endif
+
+template<class T> struct _mi_stl_allocator_common {
+  typedef T                 value_type;
+  typedef std::size_t       size_type;
+  typedef std::ptrdiff_t    difference_type;
+  typedef value_type&       reference;
+  typedef value_type const& const_reference;
+  typedef value_type*       pointer;
+  typedef value_type const* const_pointer;
+
+  #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900))  // C++11
+  using propagate_on_container_copy_assignment = std::true_type;
+  using propagate_on_container_move_assignment = std::true_type;
+  using propagate_on_container_swap            = std::true_type;
+  template <class U, class ...Args> void construct(U* p, Args&& ...args) { ::new(p) U(std::forward<Args>(args)...); }
+  template <class U> void destroy(U* p) mi_attr_noexcept { p->~U(); }
+  #else
+  void construct(pointer p, value_type const& val) { ::new(p) value_type(val); }
+  void destroy(pointer p) { p->~value_type(); }
+  #endif
+
+  size_type     max_size() const mi_attr_noexcept { return (PTRDIFF_MAX/sizeof(value_type)); }
+  pointer       address(reference x) const        { return &x; }
+  const_pointer address(const_reference x) const  { return &x; }
+};
+
+template<class T> struct mi_stl_allocator : public _mi_stl_allocator_common<T> {
+  using typename _mi_stl_allocator_common<T>::size_type;
+  using typename _mi_stl_allocator_common<T>::value_type;
+  using typename _mi_stl_allocator_common<T>::pointer;
+  template <class U> struct rebind { typedef mi_stl_allocator<U> other; };
+
+  mi_stl_allocator()                                             mi_attr_noexcept = default;
+  mi_stl_allocator(const mi_stl_allocator&)                      mi_attr_noexcept = default;
+  template<class U> mi_stl_allocator(const mi_stl_allocator<U>&) mi_attr_noexcept { }
+  mi_stl_allocator select_on_container_copy_construction() const { return *this; }
+  void deallocate(T* p, size_type) { mi_free(p); }
+
+  #if (__cplusplus >= 201703L)  // C++17
+  mi_decl_nodiscard T* allocate(size_type count) { return static_cast<T*>(mi_new_n(count, sizeof(T))); }
+  mi_decl_nodiscard T* allocate(size_type count, const void*) { return allocate(count); }
+  #else
+  mi_decl_nodiscard pointer allocate(size_type count, const void* = 0) { return static_cast<pointer>(mi_new_n(count, sizeof(value_type))); }
+  #endif
+
+  #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900))  // C++11
+  using is_always_equal = std::true_type;
+  #endif
+};
+
+template<class T1, class T2> bool operator==(const mi_stl_allocator<T1>& , const mi_stl_allocator<T2>& ) mi_attr_noexcept { return true; }
+template<class T1, class T2> bool operator!=(const mi_stl_allocator<T1>& , const mi_stl_allocator<T2>& ) mi_attr_noexcept { return false; }
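+
+// For example, a minimal usage sketch (not part of the upstream header): any
+// STL container can be redirected to mimalloc by passing the allocator as a
+// template argument.
+//
+//   std::vector<int, mi_stl_allocator<int>> v;   // vector storage uses mi_malloc/mi_free
+//   v.push_back(42);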
+
+
+#if (__cplusplus >= 201103L) || (_MSC_VER >= 1900)  // C++11
+#define MI_HAS_HEAP_STL_ALLOCATOR 1
+
+#include <memory>      // std::shared_ptr
+
+// Common base class for STL allocators in a specific heap
+template<class T, bool _mi_destroy> struct _mi_heap_stl_allocator_common : public _mi_stl_allocator_common<T> {
+  using typename _mi_stl_allocator_common<T>::size_type;
+  using typename _mi_stl_allocator_common<T>::value_type;
+  using typename _mi_stl_allocator_common<T>::pointer;
+
+  _mi_heap_stl_allocator_common(mi_heap_t* hp) : heap(hp) { }    /* will not delete nor destroy the passed in heap */
+
+  #if (__cplusplus >= 201703L)  // C++17
+  mi_decl_nodiscard T* allocate(size_type count) { return static_cast<T*>(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(T))); }
+  mi_decl_nodiscard T* allocate(size_type count, const void*) { return allocate(count); }
+  #else
+  mi_decl_nodiscard pointer allocate(size_type count, const void* = 0) { return static_cast<pointer>(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(value_type))); }
+  #endif
+
+  #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900))  // C++11
+  using is_always_equal = std::false_type;
+  #endif
+
+  void collect(bool force) { mi_heap_collect(this->heap.get(), force); }
+  template<class U> bool is_equal(const _mi_heap_stl_allocator_common<U, _mi_destroy>& x) const { return (this->heap == x.heap); }
+
+protected:
+  std::shared_ptr<mi_heap_t> heap;
+  template<class U, bool D> friend struct _mi_heap_stl_allocator_common;
+
+  _mi_heap_stl_allocator_common() {
+    mi_heap_t* hp = mi_heap_new();
+    this->heap.reset(hp, (_mi_destroy ? &heap_destroy : &heap_delete));  /* calls heap_delete/destroy when the refcount drops to zero */
+  }
+  _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common& x) mi_attr_noexcept : heap(x.heap) { }
+  template<class U> _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common<U, _mi_destroy>& x) mi_attr_noexcept : heap(x.heap) { }
+
+private:
+  static void heap_delete(mi_heap_t* hp)  { if (hp != NULL) { mi_heap_delete(hp); } }
+  static void heap_destroy(mi_heap_t* hp) { if (hp != NULL) { mi_heap_destroy(hp); } }
+};
+
+// STL allocator allocation in a specific heap
+template<class T> struct mi_heap_stl_allocator : public _mi_heap_stl_allocator_common<T, false> {
+  using typename _mi_heap_stl_allocator_common<T, false>::size_type;
+  mi_heap_stl_allocator() : _mi_heap_stl_allocator_common<T, false>() { }        // creates fresh heap that is deleted when the destructor is called
+  mi_heap_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common<T, false>(hp) { }  // no delete nor destroy on the passed in heap
+  template<class U> mi_heap_stl_allocator(const mi_heap_stl_allocator<U>& x) mi_attr_noexcept : _mi_heap_stl_allocator_common<T, false>(x) { }
+
+  mi_heap_stl_allocator select_on_container_copy_construction() const { return *this; }
+  void deallocate(T* p, size_type) { mi_free(p); }
+  template<class U> struct rebind { typedef mi_heap_stl_allocator<U> other; };
+};
+
+template<class T1, class T2> bool operator==(const mi_heap_stl_allocator<T1>& x, const mi_heap_stl_allocator<T2>& y) mi_attr_noexcept { return (x.is_equal(y)); }
+template<class T1, class T2> bool operator!=(const mi_heap_stl_allocator<T1>& x, const mi_heap_stl_allocator<T2>& y) mi_attr_noexcept { return (!x.is_equal(y)); }
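+
+// For example, an illustrative sketch (not part of the upstream header) of a
+// container bound to its own first-class heap; the heap is reference-counted by
+// the allocator copies and deleted together with the last copy:
+//
+//   mi_heap_stl_allocator<int> a;                      // creates a fresh backing heap
+//   std::vector<int, mi_heap_stl_allocator<int>> v(a);
+//   v.resize(1000);                                    // all storage comes from a's heap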
+
+
+// STL allocator allocation in a specific heap, where `free` does nothing and
+// the heap is destroyed in one go on destruction -- use with care!
+template<class T> struct mi_heap_destroy_stl_allocator : public _mi_heap_stl_allocator_common<T, true> {
+  using typename _mi_heap_stl_allocator_common<T, true>::size_type;
+  mi_heap_destroy_stl_allocator() : _mi_heap_stl_allocator_common<T, true>() { } // creates fresh heap that is destroyed when the destructor is called
+  mi_heap_destroy_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common<T, true>(hp) { } // no delete nor destroy on the passed in heap
+  template<class U> mi_heap_destroy_stl_allocator(const mi_heap_destroy_stl_allocator<U>& x) mi_attr_noexcept : _mi_heap_stl_allocator_common<T, true>(x) { }
+
+  mi_heap_destroy_stl_allocator select_on_container_copy_construction() const { return *this; }
+  void deallocate(T*, size_type) { /* do nothing as we destroy the heap on destruct. */ }
+  template<class U> struct rebind { typedef mi_heap_destroy_stl_allocator<U> other; };
+};
+
+template<class T1, class T2> bool operator==(const mi_heap_destroy_stl_allocator<T1>& x, const mi_heap_destroy_stl_allocator<T2>& y) mi_attr_noexcept { return (x.is_equal(y)); }
+template<class T1, class T2> bool operator!=(const mi_heap_destroy_stl_allocator<T1>& x, const mi_heap_destroy_stl_allocator<T2>& y) mi_attr_noexcept { return (!x.is_equal(y)); }
+
+#endif // C++11
+
+#endif // __cplusplus
+
+#endif
diff --git a/Include/internal/mimalloc/mimalloc/atomic.h b/Include/internal/mimalloc/mimalloc/atomic.h
new file mode 100644
index 0000000000000000000000000000000000000000..a46a7676ad20b8feb830f6429364dfce176aae0f
--- /dev/null
+++ b/Include/internal/mimalloc/mimalloc/atomic.h
@@ -0,0 +1,392 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_ATOMIC_H
+#define MIMALLOC_ATOMIC_H
+
+// --------------------------------------------------------------------------------------------
+// Atomics
+// We need to be portable between C, C++, and MSVC.
+// We base the primitives on the C/C++ atomics and create a minimal wrapper for MSVC in C compilation mode.
+// This is why we try to use only `uintptr_t` and `<type>*` as atomic types.
+// To gain better insight in the range of used atomics, we use explicitly named memory order operations
+// instead of passing the memory order as a parameter.
+// -----------------------------------------------------------------------------------------------
+
+#if defined(__cplusplus)
+// Use C++ atomics
+#include <atomic>
+#define  _Atomic(tp)            std::atomic<tp>
+#define  mi_atomic(name)        std::atomic_##name
+#define  mi_memory_order(name)  std::memory_order_##name
+#if (__cplusplus >= 202002L)    // c++20, see issue #571
+ #define MI_ATOMIC_VAR_INIT(x)  x
+#elif !defined(ATOMIC_VAR_INIT)
+ #define MI_ATOMIC_VAR_INIT(x)  x
+#else
+ #define MI_ATOMIC_VAR_INIT(x)  ATOMIC_VAR_INIT(x)
+#endif
+#elif defined(_MSC_VER)
+// Use MSVC C wrapper for C11 atomics
+#define  _Atomic(tp)            tp
+#define  MI_ATOMIC_VAR_INIT(x)  x
+#define  mi_atomic(name)        mi_atomic_##name
+#define  mi_memory_order(name)  mi_memory_order_##name
+#else
+// Use C11 atomics
+#include <stdatomic.h>
+#define  mi_atomic(name)        atomic_##name
+#define  mi_memory_order(name)  memory_order_##name
+#if (__STDC_VERSION__ >= 201710L)  // c17, see issue #735
+ #define MI_ATOMIC_VAR_INIT(x)  x
+#elif !defined(ATOMIC_VAR_INIT)
+ #define MI_ATOMIC_VAR_INIT(x)  x
+#else
+ #define MI_ATOMIC_VAR_INIT(x)  ATOMIC_VAR_INIT(x)
+#endif
+#endif
+
+// Various defines for all used memory orders in mimalloc
+#define mi_atomic_cas_weak(p,expected,desired,mem_success,mem_fail)  \
+  mi_atomic(compare_exchange_weak_explicit)(p,expected,desired,mem_success,mem_fail)
+
+#define mi_atomic_cas_strong(p,expected,desired,mem_success,mem_fail)  \
+  mi_atomic(compare_exchange_strong_explicit)(p,expected,desired,mem_success,mem_fail)
+
+#define mi_atomic_load_acquire(p)                mi_atomic(load_explicit)(p,mi_memory_order(acquire))
+#define mi_atomic_load_relaxed(p)                mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
+#define mi_atomic_store_release(p,x)             mi_atomic(store_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_store_relaxed(p,x)             mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_exchange_release(p,x)          mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_exchange_acq_rel(p,x)          mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_cas_weak_release(p,exp,des)    mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
+#define mi_atomic_cas_weak_acq_rel(p,exp,des)    mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
+#define mi_atomic_cas_strong_release(p,exp,des)  mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
+#define mi_atomic_cas_strong_acq_rel(p,exp,des)  mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
+
+#define mi_atomic_add_relaxed(p,x)               mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_sub_relaxed(p,x)               mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_add_acq_rel(p,x)               mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_sub_acq_rel(p,x)               mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_and_acq_rel(p,x)               mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_or_acq_rel(p,x)                mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))
+
+#define mi_atomic_increment_relaxed(p)           mi_atomic_add_relaxed(p,(uintptr_t)1)
+#define mi_atomic_decrement_relaxed(p)           mi_atomic_sub_relaxed(p,(uintptr_t)1)
+#define mi_atomic_increment_acq_rel(p)           mi_atomic_add_acq_rel(p,(uintptr_t)1)
+#define mi_atomic_decrement_acq_rel(p)           mi_atomic_sub_acq_rel(p,(uintptr_t)1)
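+
+// For example, a usage sketch of the named-order macros (`counter` is
+// illustrative only, not part of the upstream header):
+//
+//   static _Atomic(uintptr_t) counter;
+//   mi_atomic_increment_relaxed(&counter);              // no ordering needed for a statistic
+//   uintptr_t seen = mi_atomic_load_acquire(&counter);  // pairs with a release store elsewhere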
+
+static inline void mi_atomic_yield(void);
+static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add);
+static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
+
+
+#if defined(__cplusplus) || !defined(_MSC_VER)
+
+// In C++/C11 atomics we have polymorphic atomics so can use the typed `ptr` variants (where `tp` is the type of atomic value)
+// We use these macros so we can provide a typed wrapper in MSVC in C compilation mode as well
+#define mi_atomic_load_ptr_acquire(tp,p)                mi_atomic_load_acquire(p)
+#define mi_atomic_load_ptr_relaxed(tp,p)                mi_atomic_load_relaxed(p)
+
+// In C++ we need to add casts to help resolve templates if NULL is passed
+#if defined(__cplusplus)
+#define mi_atomic_store_ptr_release(tp,p,x)             mi_atomic_store_release(p,(tp*)x)
+#define mi_atomic_store_ptr_relaxed(tp,p,x)             mi_atomic_store_relaxed(p,(tp*)x)
+#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release(p,exp,(tp*)des)
+#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel(p,exp,(tp*)des)
+#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release(p,exp,(tp*)des)
+#define mi_atomic_exchange_ptr_release(tp,p,x)          mi_atomic_exchange_release(p,(tp*)x)
+#define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          mi_atomic_exchange_acq_rel(p,(tp*)x)
+#else
+#define mi_atomic_store_ptr_release(tp,p,x)             mi_atomic_store_release(p,x)
+#define mi_atomic_store_ptr_relaxed(tp,p,x)             mi_atomic_store_relaxed(p,x)
+#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release(p,exp,des)
+#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel(p,exp,des)
+#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release(p,exp,des)
+#define mi_atomic_exchange_ptr_release(tp,p,x)          mi_atomic_exchange_release(p,x)
+#define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          mi_atomic_exchange_acq_rel(p,x)
+#endif
+
+// These are used by the statistics
+static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) {
+  return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
+}
+static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) {
+  int64_t current = mi_atomic_load_relaxed((_Atomic(int64_t)*)p);
+  while (current < x && !mi_atomic_cas_weak_release((_Atomic(int64_t)*)p, &current, x)) { /* nothing */ };
+}
+
+// Used by timers
+#define mi_atomic_loadi64_acquire(p)            mi_atomic(load_explicit)(p,mi_memory_order(acquire))
+#define mi_atomic_loadi64_relaxed(p)            mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
+#define mi_atomic_storei64_release(p,x)         mi_atomic(store_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_storei64_relaxed(p,x)         mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
+
+#define mi_atomic_casi64_strong_acq_rel(p,e,d)  mi_atomic_cas_strong_acq_rel(p,e,d)
+#define mi_atomic_addi64_acq_rel(p,i)           mi_atomic_add_acq_rel(p,i)
+
+
+#elif defined(_MSC_VER)
+
+// MSVC C compilation wrapper that uses Interlocked operations to model C11 atomics.
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <intrin.h>
+#ifdef _WIN64
+typedef LONG64   msc_intptr_t;
+#define MI_64(f) f##64
+#else
+typedef LONG     msc_intptr_t;
+#define MI_64(f) f
+#endif
+
+typedef enum mi_memory_order_e {
+  mi_memory_order_relaxed,
+  mi_memory_order_consume,
+  mi_memory_order_acquire,
+  mi_memory_order_release,
+  mi_memory_order_acq_rel,
+  mi_memory_order_seq_cst
+} mi_memory_order;
+
+static inline uintptr_t mi_atomic_fetch_add_explicit(_Atomic(uintptr_t)*p, uintptr_t add, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
+}
+static inline uintptr_t mi_atomic_fetch_sub_explicit(_Atomic(uintptr_t)*p, uintptr_t sub, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub));
+}
+static inline uintptr_t mi_atomic_fetch_and_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
+}
+static inline uintptr_t mi_atomic_fetch_or_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
+}
+static inline bool mi_atomic_compare_exchange_strong_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
+  (void)(mo1); (void)(mo2);
+  uintptr_t read = (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)(*expected));
+  if (read == *expected) {
+    return true;
+  }
+  else {
+    *expected = read;
+    return false;
+  }
+}
+static inline bool mi_atomic_compare_exchange_weak_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
+  return mi_atomic_compare_exchange_strong_explicit(p, expected, desired, mo1, mo2);
+}
+static inline uintptr_t mi_atomic_exchange_explicit(_Atomic(uintptr_t)*p, uintptr_t exchange, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
+}
+static inline void mi_atomic_thread_fence(mi_memory_order mo) {
+  (void)(mo);
+  _Atomic(uintptr_t) x = 0;
+  mi_atomic_exchange_explicit(&x, 1, mo);
+}
+static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_memory_order mo) {
+  (void)(mo);
+#if defined(_M_IX86) || defined(_M_X64)
+  return *p;
+#else
+  uintptr_t x = *p;
+  if (mo > mi_memory_order_relaxed) {
+    while (!mi_atomic_compare_exchange_weak_explicit((_Atomic(uintptr_t)*)p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ };
+  }
+  return x;
+#endif
+}
+static inline void mi_atomic_store_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
+  (void)(mo);
+#if defined(_M_IX86) || defined(_M_X64)
+  *p = x;
+#else
+  mi_atomic_exchange_explicit(p, x, mo);
+#endif
+}
+static inline int64_t mi_atomic_loadi64_explicit(_Atomic(int64_t)*p, mi_memory_order mo) {
+  (void)(mo);
+#if defined(_M_X64)
+  return *p;
+#else
+  int64_t old = *p;
+  int64_t x = old;
+  while ((old = InterlockedCompareExchange64(p, x, old)) != x) {
+    x = old;
+  }
+  return x;
+#endif
+}
+static inline void mi_atomic_storei64_explicit(_Atomic(int64_t)*p, int64_t x, mi_memory_order mo) {
+  (void)(mo);
+#if defined(x_M_IX86) || defined(_M_X64)
+  *p = x;
+#else
InterlockedExchange64(p, x); +#endif +} + +// These are used by the statistics +static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)*p, int64_t add) { +#ifdef _WIN64 + return (int64_t)mi_atomic_addi((int64_t*)p, add); +#else + int64_t current; + int64_t sum; + do { + current = *p; + sum = current + add; + } while (_InterlockedCompareExchange64(p, sum, current) != current); + return current; +#endif +} +static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) { + int64_t current; + do { + current = *p; + } while (current < x && _InterlockedCompareExchange64(p, x, current) != current); +} + +static inline void mi_atomic_addi64_acq_rel(volatile _Atomic(int64_t*)p, int64_t i) { + mi_atomic_addi64_relaxed(p, i); +} + +static inline bool mi_atomic_casi64_strong_acq_rel(volatile _Atomic(int64_t*)p, int64_t* exp, int64_t des) { + int64_t read = _InterlockedCompareExchange64(p, des, *exp); + if (read == *exp) { + return true; + } + else { + *exp = read; + return false; + } +} + +// The pointer macros cast to `uintptr_t`. +#define mi_atomic_load_ptr_acquire(tp,p) (tp*)mi_atomic_load_acquire((_Atomic(uintptr_t)*)(p)) +#define mi_atomic_load_ptr_relaxed(tp,p) (tp*)mi_atomic_load_relaxed((_Atomic(uintptr_t)*)(p)) +#define mi_atomic_store_ptr_release(tp,p,x) mi_atomic_store_release((_Atomic(uintptr_t)*)(p),(uintptr_t)(x)) +#define mi_atomic_store_ptr_relaxed(tp,p,x) mi_atomic_store_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)(x)) +#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) +#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) +#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) +#define mi_atomic_exchange_ptr_release(tp,p,x) (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x) +#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x) + +#define mi_atomic_loadi64_acquire(p) mi_atomic(loadi64_explicit)(p,mi_memory_order(acquire)) +#define mi_atomic_loadi64_relaxed(p) mi_atomic(loadi64_explicit)(p,mi_memory_order(relaxed)) +#define mi_atomic_storei64_release(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(release)) +#define mi_atomic_storei64_relaxed(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(relaxed)) + + +#endif + + +// Atomically add a signed value; returns the previous value. +static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add) { + return (intptr_t)mi_atomic_add_acq_rel((_Atomic(uintptr_t)*)p, (uintptr_t)add); +} + +// Atomically subtract a signed value; returns the previous value. 
+static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) {
+  return (intptr_t)mi_atomic_addi(p, -sub);
+}
+
+typedef _Atomic(uintptr_t) mi_atomic_once_t;
+
+// Returns true only on the first invocation
+static inline bool mi_atomic_once( mi_atomic_once_t* once ) {
+  if (mi_atomic_load_relaxed(once) != 0) return false;     // quick test
+  uintptr_t expected = 0;
+  return mi_atomic_cas_strong_acq_rel(once, &expected, (uintptr_t)1); // try to set to 1
+}
+
+typedef _Atomic(uintptr_t) mi_atomic_guard_t;
+
+// Allows only one thread to execute at a time
+#define mi_atomic_guard(guard) \
+  uintptr_t _mi_guard_expected = 0; \
+  for(bool _mi_guard_once = true; \
+      _mi_guard_once && mi_atomic_cas_strong_acq_rel(guard,&_mi_guard_expected,(uintptr_t)1); \
+      (mi_atomic_store_release(guard,(uintptr_t)0), _mi_guard_once = false) )
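+
+// For example, an illustrative sketch of one-time and mutually exclusive
+// initialization (`init_once`, `init_guard`, and `do_init` are illustrative
+// only, not part of the upstream header):
+//
+//   static mi_atomic_once_t  init_once;
+//   static mi_atomic_guard_t init_guard;
+//   if (mi_atomic_once(&init_once)) { do_init(); }  // runs in exactly one thread, once
+//   mi_atomic_guard(&init_guard) {
+//     /* at most one thread enters here; concurrent threads skip the block */
+//   }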
+
+
+// Yield
+#if defined(__cplusplus)
+#include <thread>
+static inline void mi_atomic_yield(void) {
+  std::this_thread::yield();
+}
+#elif defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+static inline void mi_atomic_yield(void) {
+  YieldProcessor();
+}
+#elif defined(__SSE2__)
+#include <emmintrin.h>
+static inline void mi_atomic_yield(void) {
+  _mm_pause();
+}
+#elif (defined(__GNUC__) || defined(__clang__)) && \
+      (defined(__x86_64__) || defined(__i386__) || \
+       defined(__aarch64__) || defined(__arm__) || \
+       defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__))
+#if defined(__x86_64__) || defined(__i386__)
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile ("pause" ::: "memory");
+}
+#elif defined(__aarch64__)
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile("wfe");
+}
+#elif defined(__arm__)
+#if __ARM_ARCH >= 7
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile("yield" ::: "memory");
+}
+#else
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile ("nop" ::: "memory");
+}
+#endif
+#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__)
+#ifdef __APPLE__
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile ("or r27,r27,r27" ::: "memory");
+}
+#else
+static inline void mi_atomic_yield(void) {
+  __asm__ __volatile__ ("or 27,27,27" ::: "memory");
+}
+#endif
+#endif
+#elif defined(__sun)
+// Fallback for other archs
+#include <synch.h>
+static inline void mi_atomic_yield(void) {
+  smt_pause();
+}
+#elif defined(__wasi__)
+#include <sched.h>
+static inline void mi_atomic_yield(void) {
+  sched_yield();
+}
+#else
+#include <unistd.h>
+static inline void mi_atomic_yield(void) {
+  sleep(0);
+}
+#endif
+
+
+#endif // __MIMALLOC_ATOMIC_H
diff --git a/Include/internal/mimalloc/mimalloc/internal.h b/Include/internal/mimalloc/mimalloc/internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c16152d914509ffb6d6b8da4d778a318b84a45c
--- /dev/null
+++ b/Include/internal/mimalloc/mimalloc/internal.h
@@ -0,0 +1,969 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_INTERNAL_H
+#define MIMALLOC_INTERNAL_H
+
+
+// --------------------------------------------------------------------------
+// This file contains the internal APIs of mimalloc and various utility
+// functions and macros.
+// --------------------------------------------------------------------------
+
+#include "types.h"
+#include "track.h"
+
+#if (MI_DEBUG>0)
+#define mi_trace_message(...) _mi_trace_message(__VA_ARGS__)
+#else
+#define mi_trace_message(...)
+#endif
+
+#if defined(__EMSCRIPTEN__) && !defined(__wasi__)
+#define __wasi__
+#endif
+
+#if defined(__cplusplus)
+#define mi_decl_externc extern "C"
+#else
+#define mi_decl_externc
+#endif
+
+// pthreads
+#if !defined(_WIN32) && !defined(__wasi__)
+#define MI_USE_PTHREADS
+#include <pthread.h>
+#endif
+
+// "options.c"
+void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message);
+void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...);
+void _mi_warning_message(const char* fmt, ...);
+void _mi_verbose_message(const char* fmt, ...);
+void _mi_trace_message(const char* fmt, ...);
+void _mi_options_init(void);
+void _mi_error_message(int err, const char* fmt, ...);
+
+// random.c
+void _mi_random_init(mi_random_ctx_t* ctx);
+void _mi_random_init_weak(mi_random_ctx_t* ctx);
+void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx);
+void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx);
+uintptr_t _mi_random_next(mi_random_ctx_t* ctx);
+uintptr_t _mi_heap_random_next(mi_heap_t* heap);
+uintptr_t _mi_os_random_weak(uintptr_t extra_seed);
+static inline uintptr_t _mi_random_shuffle(uintptr_t x);
+
+// init.c
+extern mi_decl_cache_align mi_stats_t _mi_stats_main;
+extern mi_decl_cache_align const mi_page_t _mi_page_empty;
+bool _mi_is_main_thread(void);
+size_t _mi_current_thread_count(void);
+bool _mi_preloading(void); // true while the C runtime is not initialized yet
+mi_threadid_t _mi_thread_id(void) mi_attr_noexcept;
+mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap
+void _mi_thread_done(mi_heap_t* heap);
+void _mi_thread_data_collect(void);
+void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap);
+
+// os.c
+void _mi_os_init(void); // called from process init
+void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats);
+void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats);
+void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats);
+
+size_t _mi_os_page_size(void);
+size_t _mi_os_good_alloc_size(size_t size);
+bool _mi_os_has_overcommit(void);
+bool _mi_os_has_virtual_reserve(void);
+
+bool _mi_os_purge(void* p, size_t size, mi_stats_t* stats);
+bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats);
+bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
+bool _mi_os_protect(void* addr, size_t size);
+bool _mi_os_unprotect(void* addr, size_t size);
+bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats);
+
+void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats);
+void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats);
+
+void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size);
+bool _mi_os_use_large_page(size_t size, size_t alignment);
+size_t _mi_os_large_page_size(void);
+
+void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid);
+
+// arena.c
+mi_arena_id_t
_mi_arena_id_none(void); +void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid, mi_stats_t* stats); +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); +bool _mi_arena_contains(const void* p); +void _mi_arena_collect(bool force_purge, mi_stats_t* stats); +void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); + +// "segment-map.c" +void _mi_segment_map_allocated_at(const mi_segment_t* segment); +void _mi_segment_map_freed_at(const mi_segment_t* segment); + +// "segment.c" +extern mi_abandoned_pool_t _mi_abandoned_default; // global abandoned pool +mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); +void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); +void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); +bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld); +void _mi_segment_thread_collect(mi_segments_tld_t* tld); +bool _mi_abandoned_pool_visit_blocks(mi_abandoned_pool_t* pool, uint8_t page_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); + + +#if MI_HUGE_PAGE_ABANDON +void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); +#else +void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); +#endif + +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); // page start for any page +void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); +void _mi_abandoned_await_readers(mi_abandoned_pool_t *pool); +void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld); + +// "page.c" +void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; + +void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks +void _mi_page_unfull(mi_page_t* page); +void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force); // free the page +void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... 
+void _mi_heap_delayed_free_all(mi_heap_t* heap); +bool _mi_heap_delayed_free_partial(mi_heap_t* heap); +void _mi_heap_collect_retired(mi_heap_t* heap, bool force); + +void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); +bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); +size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); +void _mi_deferred_free(mi_heap_t* heap, bool force); + +void _mi_page_free_collect(mi_page_t* page,bool force); +void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments + +size_t _mi_bin_size(uint8_t bin); // for stats +uint8_t _mi_bin(size_t size); // for stats + +// "heap.c" +void _mi_heap_init_ex(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool no_reclaim, uint8_t tag); +void _mi_heap_destroy_pages(mi_heap_t* heap); +void _mi_heap_collect_abandon(mi_heap_t* heap); +void _mi_heap_set_default_direct(mi_heap_t* heap); +bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); +void _mi_heap_unsafe_destroy_all(void); +void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); +bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t *page, mi_block_visit_fun* visitor, void* arg); + +// "stats.c" +void _mi_stats_done(mi_stats_t* stats); +mi_msecs_t _mi_clock_now(void); +mi_msecs_t _mi_clock_end(mi_msecs_t start); +mi_msecs_t _mi_clock_start(void); + +// "alloc.c" +void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept; // called from `_mi_malloc_generic` +void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; +void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` +void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; +mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p); +bool _mi_free_delayed_block(mi_block_t* block); +void _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); + +// option.c, c primitives +char _mi_toupper(char c); +int _mi_strnicmp(const char* s, const char* t, size_t n); +void _mi_strlcpy(char* dest, const char* src, size_t dest_size); +void _mi_strlcat(char* dest, const char* src, size_t dest_size); +size_t _mi_strlen(const char* s); +size_t _mi_strnlen(const char* s, size_t max_len); + + +#if MI_DEBUG>1 +bool _mi_page_is_valid(mi_page_t* page); +#endif + + +// ------------------------------------------------------ +// Branches +// ------------------------------------------------------ + +#if defined(__GNUC__) || defined(__clang__) +#define mi_unlikely(x) (__builtin_expect(!!(x),false)) +#define mi_likely(x) (__builtin_expect(!!(x),true)) +#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) +#define mi_unlikely(x) (x) [[unlikely]] +#define mi_likely(x) (x) [[likely]] +#else +#define mi_unlikely(x) (x) +#define mi_likely(x) (x) +#endif + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + + +/* ----------------------------------------------------------- + Error codes passed to `_mi_fatal_error` + All are recoverable but EFAULT is a serious error and aborts by default in 
secure mode.
+  For portability define undefined error codes using common Unix codes:
+
+----------------------------------------------------------- */
+#include <errno.h>
+#ifndef EAGAIN         // double free
+#define EAGAIN (11)
+#endif
+#ifndef ENOMEM         // out of memory
+#define ENOMEM (12)
+#endif
+#ifndef EFAULT         // corrupted free-list or meta-data
+#define EFAULT (14)
+#endif
+#ifndef EINVAL         // trying to free an invalid pointer
+#define EINVAL (22)
+#endif
+#ifndef EOVERFLOW      // count*size overflow
+#define EOVERFLOW (75)
+#endif
+
+
+/* -----------------------------------------------------------
+  Inlined definitions
+----------------------------------------------------------- */
+#define MI_UNUSED(x)     (void)(x)
+#if (MI_DEBUG>0)
+#define MI_UNUSED_RELEASE(x)
+#else
+#define MI_UNUSED_RELEASE(x)  MI_UNUSED(x)
+#endif
+
+#define MI_INIT4(x)   x(),x(),x(),x()
+#define MI_INIT8(x)   MI_INIT4(x),MI_INIT4(x)
+#define MI_INIT16(x)  MI_INIT8(x),MI_INIT8(x)
+#define MI_INIT32(x)  MI_INIT16(x),MI_INIT16(x)
+#define MI_INIT64(x)  MI_INIT32(x),MI_INIT32(x)
+#define MI_INIT128(x) MI_INIT64(x),MI_INIT64(x)
+#define MI_INIT256(x) MI_INIT128(x),MI_INIT128(x)
+
+
+#include <string.h>
+// initialize a local variable to zero; use memset as compilers optimize constant sized memset's
+#define _mi_memzero_var(x)  memset(&x,0,sizeof(x))
+
+// Is `x` a power of two? (0 is considered a power of two)
+static inline bool _mi_is_power_of_two(uintptr_t x) {
+  return ((x & (x - 1)) == 0);
+}
+
+// Is a pointer aligned?
+static inline bool _mi_is_aligned(void* p, size_t alignment) {
+  mi_assert_internal(alignment != 0);
+  return (((uintptr_t)p % alignment) == 0);
+}
+
+// Align upwards
+static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
+  mi_assert_internal(alignment != 0);
+  uintptr_t mask = alignment - 1;
+  if ((alignment & mask) == 0) {  // power of two?
+    return ((sz + mask) & ~mask);
+  }
+  else {
+    return (((sz + mask)/alignment)*alignment);
+  }
+}
+
+// Align downwards
+static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) {
+  mi_assert_internal(alignment != 0);
+  uintptr_t mask = alignment - 1;
+  if ((alignment & mask) == 0) { // power of two?
+    return (sz & ~mask);
+  }
+  else {
+    return ((sz / alignment) * alignment);
+  }
+}
+
+// Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`.
+static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) {
+  mi_assert_internal(divider != 0);
+  return (divider == 0 ? size : ((size + divider - 1) / divider));
+}
+
+// Is memory zero initialized?
+static inline bool mi_mem_is_zero(const void* p, size_t size) {
+  for (size_t i = 0; i < size; i++) {
+    if (((uint8_t*)p)[i] != 0) return false;
+  }
+  return true;
+}
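+
+// For example, with a power-of-two alignment the mask form in `_mi_align_up`
+// and `_mi_align_down` above computes:
+//   _mi_align_up(13, 8)   == (13 + 7) & ~7 == 16
+//   _mi_align_down(13, 8) == 13 & ~7       == 8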
+
+
+// Align a byte size to a size in _machine words_,
+// i.e. byte size == `wsize*sizeof(void*)`.
+static inline size_t _mi_wsize_from_size(size_t size) {
+  mi_assert_internal(size <= SIZE_MAX - sizeof(uintptr_t));
+  return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
+}
+
+// Overflow detecting multiply
+#if __has_builtin(__builtin_umul_overflow) || (defined(__GNUC__) && (__GNUC__ >= 5))
+#include <limits.h>    // UINT_MAX, ULONG_MAX
+#if defined(_CLOCK_T)  // for Illumos
+#undef _CLOCK_T
+#endif
+static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
+  #if (SIZE_MAX == ULONG_MAX)
+  return __builtin_umull_overflow(count, size, (unsigned long *)total);
+  #elif (SIZE_MAX == UINT_MAX)
+  return __builtin_umul_overflow(count, size, (unsigned int *)total);
+  #else
+  return __builtin_umulll_overflow(count, size, (unsigned long long *)total);
+  #endif
+}
+#else /* __builtin_umul_overflow is unavailable */
+static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
+  #define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t)))  // sqrt(SIZE_MAX)
+  *total = count * size;
+  // note: gcc/clang optimize this to directly check the overflow flag
+  return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW) && size > 0 && (SIZE_MAX / size) < count);
+}
+#endif
+
+// Safe multiply `count*size` into `total`; return `true` on overflow.
+static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* total) {
+  if (count==1) {  // quick check for the case where count is one (common for C++ allocators)
+    *total = size;
+    return false;
+  }
+  else if mi_unlikely(mi_mul_overflow(count, size, total)) {
+    #if MI_DEBUG > 0
+    _mi_error_message(EOVERFLOW, "allocation request is too large (%zu * %zu bytes)\n", count, size);
+    #endif
+    *total = SIZE_MAX;
+    return true;
+  }
+  else return false;
+}
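+
+// For example, an illustrative sketch (not part of the upstream header) of the
+// intended call pattern when sizing an array allocation from an untrusted count:
+//
+//   size_t total;
+//   if (mi_count_size_overflow(count, sizeof(uint32_t), &total)) return NULL;
+//   void* arr = mi_malloc(total);  // safe: total == count*sizeof(uint32_t) without wrap-around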
+
+
+/*----------------------------------------------------------------------------------------
+  Heap functions
+------------------------------------------------------------------------------------------- */
+
+extern const mi_heap_t _mi_heap_empty;  // read-only empty heap, initial value of the thread local default heap
+
+static inline bool mi_heap_is_backing(const mi_heap_t* heap) {
+  return (heap->tld->heap_backing == heap);
+}
+
+static inline bool mi_heap_is_initialized(mi_heap_t* heap) {
+  mi_assert_internal(heap != NULL);
+  return (heap != &_mi_heap_empty);
+}
+
+static inline uintptr_t _mi_ptr_cookie(const void* p) {
+  extern mi_heap_t _mi_heap_main;
+  mi_assert_internal(_mi_heap_main.cookie != 0);
+  return ((uintptr_t)p ^ _mi_heap_main.cookie);
+}
+
+/* -----------------------------------------------------------
+  Pages
+----------------------------------------------------------- */
+
+static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) {
+  mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE));
+  const size_t idx = _mi_wsize_from_size(size);
+  mi_assert_internal(idx < MI_PAGES_DIRECT);
+  return heap->pages_free_direct[idx];
+}
+
+// Segment that contains the pointer
+// Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE),
+// and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it;
+// therefore we align one byte before `p`.
+static inline mi_segment_t* _mi_ptr_segment(const void* p) {
+  mi_assert_internal(p != NULL);
+  return (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK);
+}
+
+static inline mi_page_t* mi_slice_to_page(mi_slice_t* s) {
+  mi_assert_internal(s->slice_offset == 0 && s->slice_count > 0);
+  return (mi_page_t*)(s);
+}
+
+static inline mi_slice_t* mi_page_to_slice(mi_page_t* p) {
+  mi_assert_internal(p->slice_offset == 0 && p->slice_count > 0);
+  return (mi_slice_t*)(p);
+}
+
+// Segment belonging to a page
+static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
+  mi_segment_t* segment = _mi_ptr_segment(page);
+  mi_assert_internal(segment == NULL || ((mi_slice_t*)page >= segment->slices && (mi_slice_t*)page < segment->slices + segment->slice_entries));
+  return segment;
+}
+
+static inline mi_slice_t* mi_slice_first(const mi_slice_t* slice) {
+  mi_slice_t* start = (mi_slice_t*)((uint8_t*)slice - slice->slice_offset);
+  mi_assert_internal(start >= _mi_ptr_segment(slice)->slices);
+  mi_assert_internal(start->slice_offset == 0);
+  mi_assert_internal(start + start->slice_count > slice);
+  return start;
+}
+
+// Get the page containing the pointer (performance critical as it is called in mi_free)
+static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
+  mi_assert_internal(p > (void*)segment);
+  ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
+  mi_assert_internal(diff > 0 && diff <= (ptrdiff_t)MI_SEGMENT_SIZE);
+  size_t idx = (size_t)diff >> MI_SEGMENT_SLICE_SHIFT;
+  mi_assert_internal(idx <= segment->slice_entries);
+  mi_slice_t* slice0 = (mi_slice_t*)&segment->slices[idx];
+  mi_slice_t* slice = mi_slice_first(slice0); // adjust to the block that holds the page data
+  mi_assert_internal(slice->slice_offset == 0);
+  mi_assert_internal(slice >= segment->slices && slice < segment->slices + segment->slice_entries);
+  return mi_slice_to_page(slice);
+}
+
+// Quick page start for initialized pages
+static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) {
+  return _mi_segment_page_start(segment, page, page_size);
+}
+
+// Get the page containing the pointer
+static inline mi_page_t* _mi_ptr_page(void* p) {
+  return _mi_segment_page_of(_mi_ptr_segment(p), p);
+}
+
+// Get the block size of a page (special case for huge objects)
+static inline size_t mi_page_block_size(const mi_page_t* page) {
+  const size_t bsize = page->xblock_size;
+  mi_assert_internal(bsize > 0);
+  if mi_likely(bsize < MI_HUGE_BLOCK_SIZE) {
+    return bsize;
+  }
+  else {
+    size_t psize;
+    _mi_segment_page_start(_mi_page_segment(page), page, &psize);
+    return psize;
+  }
+}
+
+static inline bool mi_page_is_huge(const mi_page_t* page) {
+  return (_mi_page_segment(page)->kind == MI_SEGMENT_HUGE);
+}
+
+// Get the usable block size of a page without fixed padding.
+// This may still include internal padding due to alignment and rounding up size classes.
+static inline size_t mi_page_usable_block_size(const mi_page_t* page) { + return mi_page_block_size(page) - MI_PADDING_SIZE; +} + +// size of a segment +static inline size_t mi_segment_size(mi_segment_t* segment) { + return segment->segment_slices * MI_SEGMENT_SLICE_SIZE; +} + +static inline uint8_t* mi_segment_end(mi_segment_t* segment) { + return (uint8_t*)segment + mi_segment_size(segment); +} + +// Thread free access +static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { + return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3); +} + +static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) { + return (mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3); +} + +// Heap access +static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { + return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap)); +} + +static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { + mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); + mi_atomic_store_release(&page->xheap,(uintptr_t)heap); +} + +// Thread free flag helpers +static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { + return (mi_block_t*)(tf & ~0x03); +} +static inline mi_delayed_t mi_tf_delayed(mi_thread_free_t tf) { + return (mi_delayed_t)(tf & 0x03); +} +static inline mi_thread_free_t mi_tf_make(mi_block_t* block, mi_delayed_t delayed) { + return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed); +} +static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) { + return mi_tf_make(mi_tf_block(tf),delayed); +} +static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) { + return mi_tf_make(block, mi_tf_delayed(tf)); +} + +// are all blocks in a page freed? +// note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`. +static inline bool mi_page_all_free(const mi_page_t* page) { + mi_assert_internal(page != NULL); + return (page->used == 0); +} + +// are there any available blocks? +static inline bool mi_page_has_any_available(const mi_page_t* page) { + mi_assert_internal(page != NULL && page->reserved > 0); + return (page->used < page->reserved || (mi_page_thread_free(page) != NULL)); +} + +// are there immediately available blocks, i.e. blocks available on the free list. +static inline bool mi_page_immediate_available(const mi_page_t* page) { + mi_assert_internal(page != NULL); + return (page->free != NULL); +} + +// is more than 7/8th of a page in use? 
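+// For example (illustrative): a page with `reserved == 128` blocks has
+// `frac == 16`, so it counts as mostly used once `used >= 112`, i.e. when
+// at most 1/8th of the blocks remain unused.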
+static inline bool mi_page_mostly_used(const mi_page_t* page) {
+  if (page==NULL) return true;
+  uint16_t frac = page->reserved / 8U;
+  return (page->reserved - page->used <= frac);
+}
+
+static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) {
+  return &((mi_heap_t*)heap)->pages[_mi_bin(size)];
+}
+
+
+
+//-----------------------------------------------------------
+// Page flags
+//-----------------------------------------------------------
+static inline bool mi_page_is_in_full(const mi_page_t* page) {
+  return page->flags.x.in_full;
+}
+
+static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) {
+  page->flags.x.in_full = in_full;
+}
+
+static inline bool mi_page_has_aligned(const mi_page_t* page) {
+  return page->flags.x.has_aligned;
+}
+
+static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
+  page->flags.x.has_aligned = has_aligned;
+}
+
+
+/* -------------------------------------------------------------------
+Encoding/Decoding the free list next pointers
+
+This is to protect against buffer overflow exploits where the
+free list is mutated. Many hardened allocators xor the next pointer `p`
+with a secret key `k1`, as `p^k1`. This prevents overwriting with known
+values but might be still too weak: if the attacker can guess
+the pointer `p` this can reveal `k1` (since `p^k1^p == k1`).
+Moreover, if multiple blocks can be read as well, the attacker can
+xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot
+about the pointers (and subsequently `k1`).
+
+Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<<k1)+k1`,
+i.e. a xor with `k2`, a left-rotation over `k1`, and an addition of `k1`.
+Since these operations are not associative, the above approaches do not
+work so well any more even if the `p` can be guessed. Both keys are
+random and unique per page.
+
+We also pass a separate `null` value to be used as `NULL` or otherwise
+`(k2<<<k1)+k1` for an empty list.
+------------------------------------------------------------------- */
+
+static inline bool mi_is_in_same_segment(const void* p, const void* q) {
+  return (_mi_ptr_segment(p) == _mi_ptr_segment(q));
+}
+
+static inline bool mi_is_in_same_page(const void* p, const void* q) {
+  mi_segment_t* segment = _mi_ptr_segment(p);
+  if (_mi_ptr_segment(q) != segment) return false;
+  // `q` may be invalid, so do not dereference it; just check that it lies
+  // inside the page area of `p`
+  mi_page_t* page = _mi_segment_page_of(segment, p);
+  size_t psize;
+  uint8_t* start = _mi_segment_page_start(segment, page, &psize);
+  return (start <= (uint8_t*)q && (uint8_t*)q < start + psize);
+}
+
+static inline uintptr_t mi_rotl(uintptr_t x, uintptr_t shift) {
+  shift %= MI_INTPTR_BITS;
+  return (shift==0 ? x : ((x << shift) | (x >> (MI_INTPTR_BITS - shift))));
+}
+static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) {
+  shift %= MI_INTPTR_BITS;
+  return (shift==0 ? x : ((x >> shift) | (x << (MI_INTPTR_BITS - shift))));
+}
+
+static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) {
+  void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]);
+  return (p==null ? NULL : p);
+}
+
+static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const uintptr_t* keys) {
+  uintptr_t x = (uintptr_t)(p==NULL ? null : p);
+  return mi_rotl(x ^ keys[1], keys[0]) + keys[0];
+}
+
+static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) {
+  mi_track_mem_defined(block,sizeof(mi_block_t));
+  mi_block_t* next;
+  #ifdef MI_ENCODE_FREELIST
+  next = (mi_block_t*)mi_ptr_decode(null, mi_atomic_load_relaxed((_Atomic(mi_encoded_t)*)&block->next), keys);
+  #else
+  MI_UNUSED(keys); MI_UNUSED(null);
+  next = (mi_block_t*)mi_atomic_load_relaxed((_Atomic(mi_encoded_t)*)&block->next);
+  #endif
+  mi_track_mem_noaccess(block,sizeof(mi_block_t));
+  return next;
+}
+
+static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, const uintptr_t* keys) {
+  mi_track_mem_undefined(block,sizeof(mi_block_t));
+  #ifdef MI_ENCODE_FREELIST
+  mi_atomic_store_relaxed(&block->next, mi_ptr_encode(null, next, keys));
+  #else
+  MI_UNUSED(keys); MI_UNUSED(null);
+  mi_atomic_store_relaxed(&block->next, (mi_encoded_t)next);
+  #endif
+  mi_track_mem_noaccess(block,sizeof(mi_block_t));
+}
+
+static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) {
+  #ifdef MI_ENCODE_FREELIST
+  mi_block_t* next = mi_block_nextx(page,block,page->keys);
+  // check for free list corruption: is `next` at least in the same page?
+  // TODO: check if `next` is `page->block_size` aligned?
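+  // (illustrative) decoding is the exact inverse of mi_ptr_encode above:
+  // mi_ptr_decode(null, mi_ptr_encode(null, p, keys), keys) == p, while a
+  // block whose `next` field was overwritten decodes to an essentially
+  // random address, which the same-page check below catches cheaply.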
+ if mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next)) { + _mi_error_message(EFAULT, "corrupted free list entry of size %zub at %p: value 0x%zx\n", mi_page_block_size(page), block, (uintptr_t)next); + next = NULL; + } + return next; + #else + MI_UNUSED(page); + return mi_block_nextx(page,block,NULL); + #endif +} + +static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) { + #ifdef MI_ENCODE_FREELIST + mi_block_set_nextx(page,block,next, page->keys); + #else + MI_UNUSED(page); + mi_block_set_nextx(page,block,next,NULL); + #endif +} + + +// ------------------------------------------------------------------- +// commit mask +// ------------------------------------------------------------------- + +static inline void mi_commit_mask_create_empty(mi_commit_mask_t* cm) { + for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { + cm->mask[i] = 0; + } +} + +static inline void mi_commit_mask_create_full(mi_commit_mask_t* cm) { + for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { + cm->mask[i] = ~((size_t)0); + } +} + +static inline bool mi_commit_mask_is_empty(const mi_commit_mask_t* cm) { + for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { + if (cm->mask[i] != 0) return false; + } + return true; +} + +static inline bool mi_commit_mask_is_full(const mi_commit_mask_t* cm) { + for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { + if (cm->mask[i] != ~((size_t)0)) return false; + } + return true; +} + +// defined in `segment.c`: +size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total); +size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx); + +#define mi_commit_mask_foreach(cm,idx,count) \ + idx = 0; \ + while ((count = _mi_commit_mask_next_run(cm,&idx)) > 0) { + +#define mi_commit_mask_foreach_end() \ + idx += count; \ + } + + + +/* ----------------------------------------------------------- + memory id's +----------------------------------------------------------- */ + +static inline mi_memid_t _mi_memid_create(mi_memkind_t memkind) { + mi_memid_t memid; + _mi_memzero_var(memid); + memid.memkind = memkind; + return memid; +} + +static inline mi_memid_t _mi_memid_none(void) { + return _mi_memid_create(MI_MEM_NONE); +} + +static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool is_large) { + mi_memid_t memid = _mi_memid_create(MI_MEM_OS); + memid.initially_committed = committed; + memid.initially_zero = is_zero; + memid.is_pinned = is_large; + return memid; +} + + +// ------------------------------------------------------------------- +// Fast "random" shuffle +// ------------------------------------------------------------------- + +static inline uintptr_t _mi_random_shuffle(uintptr_t x) { + if (x==0) { x = 17; } // ensure we don't get stuck in generating zeros +#if (MI_INTPTR_SIZE==8) + // by Sebastiano Vigna, see: + x ^= x >> 30; + x *= 0xbf58476d1ce4e5b9UL; + x ^= x >> 27; + x *= 0x94d049bb133111ebUL; + x ^= x >> 31; +#elif (MI_INTPTR_SIZE==4) + // by Chris Wellons, see: + x ^= x >> 16; + x *= 0x7feb352dUL; + x ^= x >> 15; + x *= 0x846ca68bUL; + x ^= x >> 16; +#endif + return x; +} + +// ------------------------------------------------------------------- +// Optimize numa node access for the common case (= one node) +// ------------------------------------------------------------------- + +int _mi_os_numa_node_get(mi_os_tld_t* tld); +size_t _mi_os_numa_node_count_get(void); + +extern _Atomic(size_t) _mi_numa_node_count; +static inline int 
_mi_os_numa_node(mi_os_tld_t* tld) { + if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; } + else return _mi_os_numa_node_get(tld); +} +static inline size_t _mi_os_numa_node_count(void) { + const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count); + if mi_likely(count > 0) { return count; } + else return _mi_os_numa_node_count_get(); +} + + + +// ----------------------------------------------------------------------- +// Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero) +// ----------------------------------------------------------------------- + +#if defined(__GNUC__) + +#include // LONG_MAX +#define MI_HAVE_FAST_BITSCAN +static inline size_t mi_clz(uintptr_t x) { + if (x==0) return MI_INTPTR_BITS; +#if (INTPTR_MAX == LONG_MAX) + return __builtin_clzl(x); +#else + return __builtin_clzll(x); +#endif +} +static inline size_t mi_ctz(uintptr_t x) { + if (x==0) return MI_INTPTR_BITS; +#if (INTPTR_MAX == LONG_MAX) + return __builtin_ctzl(x); +#else + return __builtin_ctzll(x); +#endif +} + +#elif defined(_MSC_VER) + +#include // LONG_MAX +#include // BitScanReverse64 +#define MI_HAVE_FAST_BITSCAN +static inline size_t mi_clz(uintptr_t x) { + if (x==0) return MI_INTPTR_BITS; + unsigned long idx; +#if (INTPTR_MAX == LONG_MAX) + _BitScanReverse(&idx, x); +#else + _BitScanReverse64(&idx, x); +#endif + return ((MI_INTPTR_BITS - 1) - idx); +} +static inline size_t mi_ctz(uintptr_t x) { + if (x==0) return MI_INTPTR_BITS; + unsigned long idx; +#if (INTPTR_MAX == LONG_MAX) + _BitScanForward(&idx, x); +#else + _BitScanForward64(&idx, x); +#endif + return idx; +} + +#else +static inline size_t mi_ctz32(uint32_t x) { + // de Bruijn multiplication, see + static const unsigned char debruijn[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 + }; + if (x==0) return 32; + return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; +} +static inline size_t mi_clz32(uint32_t x) { + // de Bruijn multiplication, see + static const uint8_t debruijn[32] = { + 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, + 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 + }; + if (x==0) return 32; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27]; +} + +static inline size_t mi_clz(uintptr_t x) { + if (x==0) return MI_INTPTR_BITS; +#if (MI_INTPTR_BITS <= 32) + return mi_clz32((uint32_t)x); +#else + size_t count = mi_clz32((uint32_t)(x >> 32)); + if (count < 32) return count; + return (32 + mi_clz32((uint32_t)x)); +#endif +} +static inline size_t mi_ctz(uintptr_t x) { + if (x==0) return MI_INTPTR_BITS; +#if (MI_INTPTR_BITS <= 32) + return mi_ctz32((uint32_t)x); +#else + size_t count = mi_ctz32((uint32_t)x); + if (count < 32) return count; + return (32 + mi_ctz32((uint32_t)(x>>32))); +#endif +} + +#endif + +// "bit scan reverse": Return index of the highest bit (or MI_INTPTR_BITS if `x` is zero) +static inline size_t mi_bsr(uintptr_t x) { + return (x==0 ? MI_INTPTR_BITS : MI_INTPTR_BITS - 1 - mi_clz(x)); +} + + +// --------------------------------------------------------------------------------- +// Provide our own `_mi_memcpy` for potential performance optimizations. +// +// For now, only on Windows with msvc/clang-cl we optimize to `rep movsb` if +// we happen to run on x86/x64 cpu's that have "fast short rep movsb" (FSRM) support +// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017). 
See also issue #201 and pr #253. +// --------------------------------------------------------------------------------- + +#if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) +#include +extern bool _mi_cpu_has_fsrm; +static inline void _mi_memcpy(void* dst, const void* src, size_t n) { + if (_mi_cpu_has_fsrm) { + __movsb((unsigned char*)dst, (const unsigned char*)src, n); + } + else { + memcpy(dst, src, n); + } +} +static inline void _mi_memzero(void* dst, size_t n) { + if (_mi_cpu_has_fsrm) { + __stosb((unsigned char*)dst, 0, n); + } + else { + memset(dst, 0, n); + } +} +#else +static inline void _mi_memcpy(void* dst, const void* src, size_t n) { + memcpy(dst, src, n); +} +static inline void _mi_memzero(void* dst, size_t n) { + memset(dst, 0, n); +} +#endif + +// ------------------------------------------------------------------------------- +// The `_mi_memcpy_aligned` can be used if the pointers are machine-word aligned +// This is used for example in `mi_realloc`. +// ------------------------------------------------------------------------------- + +#if (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__) +// On GCC/CLang we provide a hint that the pointers are word aligned. +static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { + mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0)); + void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE); + const void* asrc = __builtin_assume_aligned(src, MI_INTPTR_SIZE); + _mi_memcpy(adst, asrc, n); +} + +static inline void _mi_memzero_aligned(void* dst, size_t n) { + mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); + void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE); + _mi_memzero(adst, n); +} +#else +// Default fallback on `_mi_memcpy` +static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { + mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0)); + _mi_memcpy(dst, src, n); +} + +static inline void _mi_memzero_aligned(void* dst, size_t n) { + mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); + _mi_memzero(dst, n); +} +#endif + + +#endif diff --git a/Include/internal/mimalloc/mimalloc/prim.h b/Include/internal/mimalloc/mimalloc/prim.h new file mode 100644 index 0000000000000000000000000000000000000000..322ab29e6b41c24bfaaa94c6d5af103f71bfc285 --- /dev/null +++ b/Include/internal/mimalloc/mimalloc/prim.h @@ -0,0 +1,329 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_PRIM_H +#define MIMALLOC_PRIM_H + + +// -------------------------------------------------------------------------- +// This file specifies the primitive portability API. +// Each OS/host needs to implement these primitives, see `src/prim` +// for implementations on Window, macOS, WASI, and Linux/Unix. +// +// note: on all primitive functions, we always have result parameters != NUL, and: +// addr != NULL and page aligned +// size > 0 and page aligned +// return value is an error code an int where 0 is success. 
+// -------------------------------------------------------------------------- + +// OS memory configuration +typedef struct mi_os_mem_config_s { + size_t page_size; // 4KiB + size_t large_page_size; // 2MiB + size_t alloc_granularity; // smallest allocation size (on Windows 64KiB) + bool has_overcommit; // can we reserve more memory than can be actually committed? + bool must_free_whole; // must allocated blocks be freed as a whole (false for mmap, true for VirtualAlloc) + bool has_virtual_reserve; // supports virtual address space reservation? (if true we can reserve virtual address space without using commit or physical memory) +} mi_os_mem_config_t; + +// Initialize +void _mi_prim_mem_init( mi_os_mem_config_t* config ); + +// Free OS memory +int _mi_prim_free(void* addr, size_t size ); + +// Allocate OS memory. Return NULL on error. +// The `try_alignment` is just a hint and the returned pointer does not have to be aligned. +// If `commit` is false, the virtual memory range only needs to be reserved (with no access) +// which will later be committed explicitly using `_mi_prim_commit`. +// `is_zero` is set to true if the memory was zero initialized (as on most OS's) +// pre: !commit => !allow_large +// try_alignment >= _mi_os_page_size() and a power of 2 +int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr); + +// Commit memory. Returns error code or 0 on success. +// For example, on Linux this would make the memory PROT_READ|PROT_WRITE. +// `is_zero` is set to true if the memory was zero initialized (e.g. on Windows) +int _mi_prim_commit(void* addr, size_t size, bool* is_zero); + +// Decommit memory. Returns error code or 0 on success. The `needs_recommit` result is true +// if the memory would need to be re-committed. For example, on Windows this is always true, +// but on Linux we could use MADV_DONTNEED to decommit which does not need a recommit. +// pre: needs_recommit != NULL +int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit); + +// Reset memory. The range keeps being accessible but the content might be reset. +// Returns error code or 0 on success. +int _mi_prim_reset(void* addr, size_t size); + +// Protect memory. Returns error code or 0 on success. +int _mi_prim_protect(void* addr, size_t size, bool protect); + +// Allocate huge (1GiB) pages possibly associated with a NUMA node. +// `is_zero` is set to true if the memory was zero initialized (as on most OS's) +// pre: size > 0 and a multiple of 1GiB. +// numa_node is either negative (don't care), or a numa node number. +int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr); + +// Return the current NUMA node +size_t _mi_prim_numa_node(void); + +// Return the number of logical NUMA nodes +size_t _mi_prim_numa_node_count(void); + +// Clock ticks +mi_msecs_t _mi_prim_clock_now(void); + +// Return process information (only for statistics) +typedef struct mi_process_info_s { + mi_msecs_t elapsed; + mi_msecs_t utime; + mi_msecs_t stime; + size_t current_rss; + size_t peak_rss; + size_t current_commit; + size_t peak_commit; + size_t page_faults; +} mi_process_info_t; + +void _mi_prim_process_info(mi_process_info_t* pinfo); + +// Default stderr output. (only for warnings etc. with verbose enabled) +// msg != NULL && _mi_strlen(msg) > 0 +void _mi_prim_out_stderr( const char* msg ); + +// Get an environment variable. 
(only for options) +// name != NULL, result != NULL, result_size >= 64 +bool _mi_prim_getenv(const char* name, char* result, size_t result_size); + + +// Fill a buffer with strong randomness; return `false` on error or if +// there is no strong randomization available. +bool _mi_prim_random_buf(void* buf, size_t buf_len); + +// Called on the first thread start, and should ensure `_mi_thread_done` is called on thread termination. +void _mi_prim_thread_init_auto_done(void); + +// Called on process exit and may take action to clean up resources associated with the thread auto done. +void _mi_prim_thread_done_auto_done(void); + +// Called when the default heap for a thread changes +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); + + +//------------------------------------------------------------------- +// Thread id: `_mi_prim_thread_id()` +// +// Getting the thread id should be performant as it is called in the +// fast path of `_mi_free` and we specialize for various platforms as +// inlined definitions. Regular code should call `init.c:_mi_thread_id()`. +// We only require _mi_prim_thread_id() to return a unique id +// for each thread (unequal to zero). +//------------------------------------------------------------------- + +// defined in `init.c`; do not use these directly +extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from +extern bool _mi_process_is_initialized; // has mi_process_init been called? + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; + +#ifdef MI_PRIM_THREAD_ID + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + return MI_PRIM_THREAD_ID(); +} + +#elif defined(_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + // Windows: works on Intel and ARM in both 32- and 64-bit + return (uintptr_t)NtCurrentTeb(); +} + +// We use assembly for a fast thread id on the main platforms. The TLS layout depends on +// both the OS and libc implementation so we use specific tests for each main platform. +// If you test on another platform and it works please send a PR :-) +// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register. 
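+//
+// (illustrative) the id returned here is what mimalloc stores in
+// `segment->thread_id`; the free path can then compare it against the
+// calling thread's `_mi_prim_thread_id()` to take the fast local-free path
+// only when the calling thread owns the segment.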
+#elif defined(__GNUC__) && ( \ + (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \ + || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__))) \ + || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \ + || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ + || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ + ) + +static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { + void* res; + const size_t ofs = (slot*sizeof(void*)); + #if defined(__i386__) + __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86 32-bit always uses GS + #elif defined(__APPLE__) && defined(__x86_64__) + __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS + #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) + __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x32 ABI + #elif defined(__x86_64__) + __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS + #elif defined(__arm__) + void** tcb; MI_UNUSED(ofs); + __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); + res = tcb[slot]; + #elif defined(__aarch64__) + void** tcb; MI_UNUSED(ofs); + #if defined(__APPLE__) // M1, issue #343 + __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb)); + #else + __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); + #endif + res = tcb[slot]; + #endif + return res; +} + +// setting a tls slot is only used on macOS for now +static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { + const size_t ofs = (slot*sizeof(void*)); + #if defined(__i386__) + __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS + #elif defined(__APPLE__) && defined(__x86_64__) + __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOS uses GS + #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) + __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x32 ABI + #elif defined(__x86_64__) + __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS + #elif defined(__arm__) + void** tcb; MI_UNUSED(ofs); + __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); + tcb[slot] = value; + #elif defined(__aarch64__) + void** tcb; MI_UNUSED(ofs); + #if defined(__APPLE__) // M1, issue #343 + __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb)); + #else + __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); + #endif + tcb[slot] = value; + #endif +} + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + #if defined(__BIONIC__) + // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id + // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86 + return (uintptr_t)mi_prim_tls_slot(1); + #else + // in all our other targets, slot 0 is the thread id + // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h + // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36 + return (uintptr_t)mi_prim_tls_slot(0); + #endif +} + +#else + +// otherwise use portable C, taking the address of 
a thread local variable (this is still very fast on most platforms). +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + return (uintptr_t)&_mi_heap_default; +} + +#endif + + + +/* ---------------------------------------------------------------------------------------- +The thread local default heap: `_mi_prim_get_default_heap()` +This is inlined here as it is on the fast path for allocation functions. + +On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a +__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures +that the storage will always be available (allocated on the thread stacks). + +On some platforms though we cannot use that when overriding `malloc` since the underlying +TLS implementation (or the loader) will call itself `malloc` on a first access and recurse. +We try to circumvent this in an efficient way: +- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the + loader itself calls `malloc` even before the modules are initialized. +- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS). +- DragonFly: defaults are working but seem slow compared to freeBSD (see PR #323) +------------------------------------------------------------------------------------------- */ + +static inline mi_heap_t* mi_prim_get_default_heap(void); + +#if defined(MI_MALLOC_OVERRIDE) +#if defined(__APPLE__) // macOS + #define MI_TLS_SLOT 89 // seems unused? + // #define MI_TLS_RECURSE_GUARD 1 + // other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) + // see +#elif defined(__OpenBSD__) + // use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16) + // see + #define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 4*sizeof(void*) + 24) + // #elif defined(__DragonFly__) + // #warning "mimalloc is not working correctly on DragonFly yet." + // #define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) +#elif defined(__ANDROID__) + // See issue #381 + #define MI_TLS_PTHREAD +#endif +#endif + + +#if defined(MI_TLS_SLOT) + +static inline mi_heap_t* mi_prim_get_default_heap(void) { + mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT); + if mi_unlikely(heap == NULL) { + #ifdef __GNUC__ + __asm(""); // prevent conditional load of the address of _mi_heap_empty + #endif + heap = (mi_heap_t*)&_mi_heap_empty; + } + return heap; +} + +#elif defined(MI_TLS_PTHREAD_SLOT_OFS) + +static inline mi_heap_t** mi_prim_tls_pthread_heap_slot(void) { + pthread_t self = pthread_self(); + #if defined(__DragonFly__) + if (self==NULL) return NULL; + #endif + return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS); +} + +static inline mi_heap_t* mi_prim_get_default_heap(void) { + mi_heap_t** pheap = mi_prim_tls_pthread_heap_slot(); + if mi_unlikely(pheap == NULL) return _mi_heap_main_get(); + mi_heap_t* heap = *pheap; + if mi_unlikely(heap == NULL) return (mi_heap_t*)&_mi_heap_empty; + return heap; +} + +#elif defined(MI_TLS_PTHREAD) + +extern pthread_key_t _mi_heap_default_key; +static inline mi_heap_t* mi_prim_get_default_heap(void) { + mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); + return (mi_unlikely(heap == NULL) ? 
(mi_heap_t*)&_mi_heap_empty : heap); +} + +#else // default using a thread local variable; used on most platforms. + +static inline mi_heap_t* mi_prim_get_default_heap(void) { + #if defined(MI_TLS_RECURSE_GUARD) + if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get(); + #endif + return _mi_heap_default; +} + +#endif // mi_prim_get_default_heap() + + + +#endif // MIMALLOC_PRIM_H diff --git a/Include/internal/mimalloc/mimalloc/track.h b/Include/internal/mimalloc/mimalloc/track.h new file mode 100644 index 0000000000000000000000000000000000000000..fa1a048d846a9cfcb42bea37813be445847bf344 --- /dev/null +++ b/Include/internal/mimalloc/mimalloc/track.h @@ -0,0 +1,147 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_TRACK_H +#define MIMALLOC_TRACK_H + +/* ------------------------------------------------------------------------------------------------------ +Track memory ranges with macros for tools like Valgrind address sanitizer, or other memory checkers. +These can be defined for tracking allocation: + + #define mi_track_malloc_size(p,reqsize,size,zero) + #define mi_track_free_size(p,_size) + +The macros are set up such that the size passed to `mi_track_free_size` +always matches the size of `mi_track_malloc_size`. (currently, `size == mi_usable_size(p)`). +The `reqsize` is what the user requested, and `size >= reqsize`. +The `size` is either byte precise (and `size==reqsize`) if `MI_PADDING` is enabled, +or otherwise it is the usable block size which may be larger than the original request. +Use `_mi_block_size_of(void* p)` to get the full block size that was allocated (including padding etc). +The `zero` parameter is `true` if the allocated block is zero initialized. + +Optional: + + #define mi_track_align(p,alignedp,offset,size) + #define mi_track_resize(p,oldsize,newsize) + #define mi_track_init() + +The `mi_track_align` is called right after a `mi_track_malloc` for aligned pointers in a block. +The corresponding `mi_track_free` still uses the block start pointer and original size (corresponding to the `mi_track_malloc`). +The `mi_track_resize` is currently unused but could be called on reallocations within a block. +`mi_track_init` is called at program start. 
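+
+For example (illustrative): without MI_PADDING, a 10-byte allocation whose
+block has a usable size of 16 is reported as
+`mi_track_malloc_size(p, 10, 16, zero)`, and the later free reports
+`mi_track_free_size(p, 16)` with that same usable size.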
+ +The following macros are for tools like asan and valgrind to track whether memory is +defined, undefined, or not accessible at all: + + #define mi_track_mem_defined(p,size) + #define mi_track_mem_undefined(p,size) + #define mi_track_mem_noaccess(p,size) + +-------------------------------------------------------------------------------------------------------*/ + +#if MI_TRACK_VALGRIND +// valgrind tool + +#define MI_TRACK_ENABLED 1 +#define MI_TRACK_HEAP_DESTROY 1 // track free of individual blocks on heap_destroy +#define MI_TRACK_TOOL "valgrind" + +#include +#include + +#define mi_track_malloc_size(p,reqsize,size,zero) VALGRIND_MALLOCLIKE_BLOCK(p,size,MI_PADDING_SIZE /*red zone*/,zero) +#define mi_track_free_size(p,_size) VALGRIND_FREELIKE_BLOCK(p,MI_PADDING_SIZE /*red zone*/) +#define mi_track_resize(p,oldsize,newsize) VALGRIND_RESIZEINPLACE_BLOCK(p,oldsize,newsize,MI_PADDING_SIZE /*red zone*/) +#define mi_track_mem_defined(p,size) VALGRIND_MAKE_MEM_DEFINED(p,size) +#define mi_track_mem_undefined(p,size) VALGRIND_MAKE_MEM_UNDEFINED(p,size) +#define mi_track_mem_noaccess(p,size) VALGRIND_MAKE_MEM_NOACCESS(p,size) + +#elif MI_TRACK_ASAN +// address sanitizer + +#define MI_TRACK_ENABLED 1 +#define MI_TRACK_HEAP_DESTROY 0 +#define MI_TRACK_TOOL "asan" + +#include + +#define mi_track_malloc_size(p,reqsize,size,zero) ASAN_UNPOISON_MEMORY_REGION(p,size) +#define mi_track_free_size(p,size) ASAN_POISON_MEMORY_REGION(p,size) +#define mi_track_mem_defined(p,size) ASAN_UNPOISON_MEMORY_REGION(p,size) +#define mi_track_mem_undefined(p,size) ASAN_UNPOISON_MEMORY_REGION(p,size) +#define mi_track_mem_noaccess(p,size) ASAN_POISON_MEMORY_REGION(p,size) + +#elif MI_TRACK_ETW +// windows event tracing + +#define MI_TRACK_ENABLED 1 +#define MI_TRACK_HEAP_DESTROY 1 +#define MI_TRACK_TOOL "ETW" + +#define WIN32_LEAN_AND_MEAN +#include +#include "../src/prim/windows/etw.h" + +#define mi_track_init() EventRegistermicrosoft_windows_mimalloc(); +#define mi_track_malloc_size(p,reqsize,size,zero) EventWriteETW_MI_ALLOC((UINT64)(p), size) +#define mi_track_free_size(p,size) EventWriteETW_MI_FREE((UINT64)(p), size) + +#else +// no tracking + +#define MI_TRACK_ENABLED 0 +#define MI_TRACK_HEAP_DESTROY 0 +#define MI_TRACK_TOOL "none" + +#define mi_track_malloc_size(p,reqsize,size,zero) +#define mi_track_free_size(p,_size) + +#endif + +// ------------------- +// Utility definitions + +#ifndef mi_track_resize +#define mi_track_resize(p,oldsize,newsize) mi_track_free_size(p,oldsize); mi_track_malloc(p,newsize,false) +#endif + +#ifndef mi_track_align +#define mi_track_align(p,alignedp,offset,size) mi_track_mem_noaccess(p,offset) +#endif + +#ifndef mi_track_init +#define mi_track_init() +#endif + +#ifndef mi_track_mem_defined +#define mi_track_mem_defined(p,size) +#endif + +#ifndef mi_track_mem_undefined +#define mi_track_mem_undefined(p,size) +#endif + +#ifndef mi_track_mem_noaccess +#define mi_track_mem_noaccess(p,size) +#endif + + +#if MI_PADDING +#define mi_track_malloc(p,reqsize,zero) \ + if ((p)!=NULL) { \ + mi_assert_internal(mi_usable_size(p)==(reqsize)); \ + mi_track_malloc_size(p,reqsize,reqsize,zero); \ + } +#else +#define mi_track_malloc(p,reqsize,zero) \ + if ((p)!=NULL) { \ + mi_assert_internal(mi_usable_size(p)>=(reqsize)); \ + mi_track_malloc_size(p,reqsize,mi_usable_size(p),zero); \ + } +#endif + +#endif diff --git a/Include/internal/mimalloc/mimalloc/types.h b/Include/internal/mimalloc/mimalloc/types.h new file mode 100644 index 
0000000000000000000000000000000000000000..70c600e920bad198bcf8810ea068c59471433b9f --- /dev/null +++ b/Include/internal/mimalloc/mimalloc/types.h @@ -0,0 +1,721 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_TYPES_H +#define MIMALLOC_TYPES_H + +// -------------------------------------------------------------------------- +// This file contains the main type definitions for mimalloc: +// mi_heap_t : all data for a thread-local heap, contains +// lists of all managed heap pages. +// mi_segment_t : a larger chunk of memory (32GiB) from where pages +// are allocated. +// mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from +// where objects are allocated. +// -------------------------------------------------------------------------- + + +#include // ptrdiff_t +#include // uintptr_t, uint16_t, etc +#include "atomic.h" // _Atomic + +#ifdef _MSC_VER +#pragma warning(disable:4214) // bitfield is not int +#endif + +// Minimal alignment necessary. On most platforms 16 bytes are needed +// due to SSE registers for example. This must be at least `sizeof(void*)` +#ifndef MI_MAX_ALIGN_SIZE +#define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t) +#endif + +#define MI_CACHE_LINE 64 +#if defined(_MSC_VER) +#pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) +#pragma warning(disable:26812) // unscoped enum warning +#define mi_decl_noinline __declspec(noinline) +#define mi_decl_thread __declspec(thread) +#define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) +#elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc +#define mi_decl_noinline __attribute__((noinline)) +#define mi_decl_thread __thread +#define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE))) +#else +#define mi_decl_noinline +#define mi_decl_thread __thread // hope for the best :-) +#define mi_decl_cache_align +#endif + +// ------------------------------------------------------ +// Variants +// ------------------------------------------------------ + +// Define NDEBUG in the release version to disable assertions. +// #define NDEBUG + +// Define MI_TRACK_ to enable tracking support +// #define MI_TRACK_VALGRIND 1 +// #define MI_TRACK_ASAN 1 +// #define MI_TRACK_ETW 1 + +// Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance). +// #define MI_STAT 1 + +// Define MI_SECURE to enable security mitigations +// #define MI_SECURE 1 // guard page around metadata +// #define MI_SECURE 2 // guard page around each mimalloc page +// #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free) +// #define MI_SECURE 4 // checks for double free. (may be more expensive) + +#if !defined(MI_SECURE) +#define MI_SECURE 0 +#endif + +// Define MI_DEBUG for debug mode +// #define MI_DEBUG 1 // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free. 
+// #define MI_DEBUG 2  // + internal assertion checks
+// #define MI_DEBUG 3  // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON)
+#if !defined(MI_DEBUG)
+#if !defined(NDEBUG) || defined(_DEBUG)
+#define MI_DEBUG 2
+#else
+#define MI_DEBUG 0
+#endif
+#endif
+
+// Reserve extra padding at the end of each block to be more resilient against heap block overflows.
+// The padding can detect buffer overflow on free.
+#if !defined(MI_PADDING) && (MI_SECURE>=3 || MI_DEBUG>=1 || (MI_TRACK_VALGRIND || MI_TRACK_ASAN || MI_TRACK_ETW))
+#define MI_PADDING 1
+#endif
+
+// Check padding bytes; allows byte-precise buffer overflow detection
+#if !defined(MI_PADDING_CHECK) && MI_PADDING && (MI_SECURE>=3 || MI_DEBUG>=1)
+#define MI_PADDING_CHECK 1
+#endif
+
+
+// Encoded free lists allow detection of corrupted free lists
+// and can detect buffer overflows, modify after free, and double `free`s.
+#if (MI_SECURE>=3 || MI_DEBUG>=1)
+#define MI_ENCODE_FREELIST 1
+#endif
+
+
+// We used to abandon huge pages and eagerly deallocate them if freed from
+// another thread, but that makes it impossible to visit them during a heap
+// walk or include them in a `mi_heap_destroy`. We therefore instead
+// reset/decommit the huge blocks if freed from another thread so most memory
+// is available until it gets properly freed by the owning thread.
+// #define MI_HUGE_PAGE_ABANDON 1
+
+
+// ------------------------------------------------------
+// Platform specific values
+// ------------------------------------------------------
+
+// ------------------------------------------------------
+// Size of a pointer.
+// We assume that `sizeof(void*)==sizeof(intptr_t)`
+// and it holds for all platforms we know of.
+//
+// However, the C standard only requires that:
+//  p == (void*)((intptr_t)p))
+// but we also need:
+//  i == (intptr_t)((void*)i)
+// or otherwise one might define an intptr_t type that is larger than a pointer...
+// ------------------------------------------------------
+
+#if INTPTR_MAX > INT64_MAX
+# define MI_INTPTR_SHIFT (4)  // assume 128-bit (as on arm CHERI for example)
+#elif INTPTR_MAX == INT64_MAX
+# define MI_INTPTR_SHIFT (3)
+#elif INTPTR_MAX == INT32_MAX
+# define MI_INTPTR_SHIFT (2)
+#else
+#error platform pointers must be 32, 64, or 128 bits
+#endif
+
+#if SIZE_MAX == UINT64_MAX
+# define MI_SIZE_SHIFT (3)
+typedef int64_t mi_ssize_t;
+#elif SIZE_MAX == UINT32_MAX
+# define MI_SIZE_SHIFT (2)
+typedef int32_t mi_ssize_t;
+#else
+#error platform objects must be 32 or 64 bits
+#endif
+
+#if (SIZE_MAX/2) > LONG_MAX
+# define MI_ZU(x) x##ULL
+# define MI_ZI(x) x##LL
+#else
+# define MI_ZU(x) x##UL
+# define MI_ZI(x) x##L
+#endif
+
+#define MI_INTPTR_SIZE (1<<MI_INTPTR_SHIFT)
+#define MI_INTPTR_BITS (MI_INTPTR_SIZE*8)
+
+#define MI_SIZE_SIZE (1<<MI_SIZE_SHIFT)
+#define MI_SIZE_BITS (MI_SIZE_SIZE*8)
+
+#define MI_KiB (MI_ZU(1024))
+#define MI_MiB (MI_KiB*MI_KiB)
+#define MI_GiB (MI_MiB*MI_KiB)
+
+
+// ------------------------------------------------------
+// Main internal data-structures
+// ------------------------------------------------------
+
+// Main tuning parameters for segment and page sizes
+// Sizes for 64-bit (usually divide by two for 32-bit)
+#define MI_SEGMENT_SLICE_SHIFT (13 + MI_INTPTR_SHIFT)  // 64KiB (32KiB on 32-bit)
+
+#if MI_INTPTR_SIZE > 4
+#define MI_SEGMENT_SHIFT ( 9 + MI_SEGMENT_SLICE_SHIFT) // 32MiB
+#else
+#define MI_SEGMENT_SHIFT ( 7 + MI_SEGMENT_SLICE_SHIFT) // 4MiB on 32-bit
+#endif
+
+#define MI_SMALL_PAGE_SHIFT (MI_SEGMENT_SLICE_SHIFT) // 64KiB
+#define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB
+
+
+// Derived constants
+#define MI_SEGMENT_SIZE (MI_ZU(1)<<MI_SEGMENT_SHIFT)
+#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE
+#define MI_SEGMENT_MASK ((uintptr_t)(MI_SEGMENT_ALIGN - 1))
+#define MI_SEGMENT_SLICE_SIZE (MI_ZU(1)<<MI_SEGMENT_SLICE_SHIFT)
+#define MI_SLICES_PER_SEGMENT (MI_SEGMENT_SIZE / MI_SEGMENT_SLICE_SIZE) // 512
+
+#define MI_SMALL_PAGE_SIZE (MI_ZU(1)<<MI_SMALL_PAGE_SHIFT)
+#define MI_MEDIUM_PAGE_SIZE (MI_ZU(1)<<MI_MEDIUM_PAGE_SHIFT)
+
+#define MI_SMALL_OBJ_SIZE_MAX (MI_SMALL_PAGE_SIZE/4)   // 16KiB on 64-bit
+#define MI_MEDIUM_OBJ_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 128KiB on 64-bit
+#define MI_MEDIUM_OBJ_WSIZE_MAX (MI_MEDIUM_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
+#define MI_LARGE_OBJ_SIZE_MAX (MI_SEGMENT_SIZE/2)      // 16MiB on 64-bit
+#define MI_LARGE_OBJ_WSIZE_MAX (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
+
+// Maximum number of size classes. (spaced exponentially in 12.5% increments)
+#define MI_BIN_HUGE (73U)
+
+#if (MI_MEDIUM_OBJ_WSIZE_MAX >= 655360)
+#error "mimalloc internal: define more bins"
+#endif
+
+// Maximum slice offset (15)
+#define MI_MAX_SLICE_OFFSET ((MI_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1)
+
+// Used as a special value to encode block sizes in 32 bits.
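+// (illustrative) on 64-bit the arithmetic above works out as: slices are
+// 2^(13+3) = 64KiB, segments are 2^(9+16) = 32MiB, so a segment holds
+// 32MiB/64KiB = 512 slices; and since a page's `xblock_size` is only 32 bits,
+// block sizes too large for it are represented by the sentinel below and
+// recomputed from the segment page size (see `mi_page_block_size`).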
+#define MI_HUGE_BLOCK_SIZE ((uint32_t)(2*MI_GiB)) + +// blocks up to this size are always allocated aligned +#define MI_MAX_ALIGN_GUARANTEE (8*MI_MAX_ALIGN_SIZE) + +// Alignments over MI_ALIGNMENT_MAX are allocated in dedicated huge page segments +#define MI_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) + + +// ------------------------------------------------------ +// Mimalloc pages contain allocated blocks +// ------------------------------------------------------ + +// The free lists use encoded next fields +// (Only actually encodes when MI_ENCODED_FREELIST is defined.) +typedef uintptr_t mi_encoded_t; + +// thread id's +typedef size_t mi_threadid_t; + +// free lists contain blocks +typedef struct mi_block_s { + _Atomic(mi_encoded_t) next; +} mi_block_t; + + +// The delayed flags are used for efficient multi-threaded free-ing +typedef enum mi_delayed_e { + MI_USE_DELAYED_FREE = 0, // push on the owning heap thread delayed list + MI_DELAYED_FREEING = 1, // temporary: another thread is accessing the owning heap + MI_NO_DELAYED_FREE = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list + MI_NEVER_DELAYED_FREE = 3 // sticky, only resets on page reclaim +} mi_delayed_t; + + +// The `in_full` and `has_aligned` page flags are put in a union to efficiently +// test if both are false (`full_aligned == 0`) in the `mi_free` routine. +#if !MI_TSAN +typedef union mi_page_flags_s { + uint8_t full_aligned; + struct { + uint8_t in_full : 1; + uint8_t has_aligned : 1; + } x; +} mi_page_flags_t; +#else +// under thread sanitizer, use a byte for each flag to suppress warning, issue #130 +typedef union mi_page_flags_s { + uint16_t full_aligned; + struct { + uint8_t in_full; + uint8_t has_aligned; + } x; +} mi_page_flags_t; +#endif + +// Thread free list. +// We use the bottom 2 bits of the pointer for mi_delayed_t flags +typedef uintptr_t mi_thread_free_t; + +// A page contains blocks of one specific size (`block_size`). +// Each page has three list of free blocks: +// `free` for blocks that can be allocated, +// `local_free` for freed blocks that are not yet available to `mi_malloc` +// `thread_free` for freed blocks by other threads +// The `local_free` and `thread_free` lists are migrated to the `free` list +// when it is exhausted. The separate `local_free` list is necessary to +// implement a monotonic heartbeat. The `thread_free` list is needed for +// avoiding atomic operations in the common case. +// +// +// `used - |thread_free|` == actual blocks that are in use (alive) +// `used - |thread_free| + |free| + |local_free| == capacity` +// +// We don't count `freed` (as |free|) but use `used` to reduce +// the number of memory accesses in the `mi_page_all_free` function(s). +// +// Notes: +// - Access is optimized for `mi_free` and `mi_page_alloc` (in `alloc.c`) +// - Using `uint16_t` does not seem to slow things down +// - The size is 8 words on 64-bit which helps the page index calculations +// (and 10 words on 32-bit, and encoded free lists add 2 words. Sizes 10 +// and 12 are still good for address calculation) +// - To limit the structure size, the `xblock_size` is 32-bits only; for +// blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size +// - `thread_free` uses the bottom bits as a delayed-free flags to optimize +// concurrent frees where only the first concurrent free adds to the owning +// heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`). 
+// The invariant is that no-delayed-free is only set if there is +// at least one block that will be added, or as already been added, to +// the owning heap `thread_delayed_free` list. This guarantees that pages +// will be freed correctly even if only other threads free blocks. +typedef struct mi_page_s { + // "owned" by the segment + uint32_t slice_count; // slices in this page (0 if not a page) + uint32_t slice_offset; // distance from the actual page data slice (0 if a page) + uint8_t is_committed : 1; // `true` if the page virtual memory is committed + uint8_t is_zero_init : 1; // `true` if the page was initially zero initialized + uint8_t use_qsbr : 1; // delay page freeing using qsbr + uint8_t tag : 4; // tag from the owning heap + uint8_t debug_offset; // number of bytes to preserve when filling freed or uninitialized memory + + // layout like this to optimize access in `mi_malloc` and `mi_free` + uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` + uint16_t reserved; // number of blocks reserved in memory + mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) + uint8_t free_is_zero : 1; // `true` if the blocks in the free list are zero initialized + uint8_t retire_expire : 7; // expiration count for retired blocks + + mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) + uint32_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`) + uint32_t xblock_size; // size available in each block (always `>0`) + mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) + + #if (MI_ENCODE_FREELIST || MI_PADDING) + uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary + #endif + + _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads + _Atomic(uintptr_t) xheap; + + struct mi_page_s* next; // next page owned by this thread with the same `block_size` + struct mi_page_s* prev; // previous page owned by this thread with the same `block_size` + +#ifdef Py_GIL_DISABLED + struct llist_node qsbr_node; + uint64_t qsbr_goal; +#endif + + // 64-bit 9 words, 32-bit 12 words, (+2 for secure) + #if MI_INTPTR_SIZE==8 && !defined(Py_GIL_DISABLED) + uintptr_t padding[1]; + #endif +} mi_page_t; + + + +// ------------------------------------------------------ +// Mimalloc segments contain mimalloc pages +// ------------------------------------------------------ + +typedef enum mi_page_kind_e { + MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment + MI_PAGE_MEDIUM, // medium blocks go into medium pages inside a segment + MI_PAGE_LARGE, // larger blocks go into a page of just one block + MI_PAGE_HUGE, // huge blocks (> 16 MiB) are put into a single page in a single segment. +} mi_page_kind_t; + +typedef enum mi_segment_kind_e { + MI_SEGMENT_NORMAL, // MI_SEGMENT_SIZE size with pages inside. + MI_SEGMENT_HUGE, // > MI_LARGE_SIZE_MAX segment with just one huge page inside. +} mi_segment_kind_t; + +// ------------------------------------------------------ +// A segment holds a commit mask where a bit is set if +// the corresponding MI_COMMIT_SIZE area is committed. +// The MI_COMMIT_SIZE must be a multiple of the slice +// size. If it is equal we have the most fine grained +// decommit (but setting it higher can be more efficient). 
+// The MI_MINIMAL_COMMIT_SIZE is the minimal amount that will +// be committed in one go which can be set higher than +// MI_COMMIT_SIZE for efficiency (while the decommit mask +// is still tracked in fine-grained MI_COMMIT_SIZE chunks) +// ------------------------------------------------------ + +#define MI_MINIMAL_COMMIT_SIZE (1*MI_SEGMENT_SLICE_SIZE) +#define MI_COMMIT_SIZE (MI_SEGMENT_SLICE_SIZE) // 64KiB +#define MI_COMMIT_MASK_BITS (MI_SEGMENT_SIZE / MI_COMMIT_SIZE) +#define MI_COMMIT_MASK_FIELD_BITS MI_SIZE_BITS +#define MI_COMMIT_MASK_FIELD_COUNT (MI_COMMIT_MASK_BITS / MI_COMMIT_MASK_FIELD_BITS) + +#if (MI_COMMIT_MASK_BITS != (MI_COMMIT_MASK_FIELD_COUNT * MI_COMMIT_MASK_FIELD_BITS)) +#error "the segment size must be exactly divisible by the (commit size * size_t bits)" +#endif + +typedef struct mi_commit_mask_s { + size_t mask[MI_COMMIT_MASK_FIELD_COUNT]; +} mi_commit_mask_t; + +typedef mi_page_t mi_slice_t; +typedef int64_t mi_msecs_t; + + +// Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this. +typedef enum mi_memkind_e { + MI_MEM_NONE, // not allocated + MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) + MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) + MI_MEM_OS, // allocated from the OS + MI_MEM_OS_HUGE, // allocated as huge os pages + MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`) + MI_MEM_ARENA // allocated from an arena (the usual case) +} mi_memkind_t; + +static inline bool mi_memkind_is_os(mi_memkind_t memkind) { + return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP); +} + +typedef struct mi_memid_os_info { + void* base; // actual base address of the block (used for offset aligned allocations) + size_t alignment; // alignment at allocation +} mi_memid_os_info_t; + +typedef struct mi_memid_arena_info { + size_t block_index; // index in the arena + mi_arena_id_t id; // arena id (>= 1) + bool is_exclusive; // the arena can only be used for specific arena allocations +} mi_memid_arena_info_t; + +typedef struct mi_memid_s { + union { + mi_memid_os_info_t os; // only used for MI_MEM_OS + mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA + } mem; + bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large OS pages) + bool initially_committed;// `true` if the memory was originally allocated as committed + bool initially_zero; // `true` if the memory was originally zero initialized + mi_memkind_t memkind; +} mi_memid_t; + + +// Segments are large allocated memory blocks (8mb on 64 bit) from +// the OS. Inside segments we allocated fixed size _pages_ that +// contain blocks. +typedef struct mi_segment_s { + // constant fields + mi_memid_t memid; // memory id for arena allocation + bool allow_decommit; + bool allow_purge; + size_t segment_size; + + // segment fields + mi_msecs_t purge_expire; + mi_commit_mask_t purge_mask; + mi_commit_mask_t commit_mask; + + _Atomic(struct mi_segment_s*) abandoned_next; + + // from here is zero initialized + struct mi_segment_s* next; // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`) + + size_t abandoned; // abandoned pages (i.e. 
the original owning thread stopped) (`abandoned <= used`) + size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim it it is too long) + size_t used; // count of pages in use + uintptr_t cookie; // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie` + + size_t segment_slices; // for huge segments this may be different from `MI_SLICES_PER_SEGMENT` + size_t segment_info_slices; // initial slices we are using segment info and possible guard pages. + + // layout like this to optimize access in `mi_free` + mi_segment_kind_t kind; + size_t slice_entries; // entries in the `slices` array, at most `MI_SLICES_PER_SEGMENT` + _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment + + mi_slice_t slices[MI_SLICES_PER_SEGMENT+1]; // one more for huge blocks with large alignment +} mi_segment_t; + +typedef uintptr_t mi_tagged_segment_t; + +// Segments unowned by any thread are put in a shared pool +typedef struct mi_abandoned_pool_s { + // This is a list of visited abandoned pages that were full at the time. + // this list migrates to `abandoned` when that becomes NULL. The use of + // this list reduces contention and the rate at which segments are visited. + mi_decl_cache_align _Atomic(mi_segment_t*) abandoned_visited; // = NULL + + // The abandoned page list (tagged as it supports pop) + mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned; // = NULL + + // Maintain these for debug purposes (these counts may be a bit off) + mi_decl_cache_align _Atomic(size_t) abandoned_count; + mi_decl_cache_align _Atomic(size_t) abandoned_visited_count; + + // We also maintain a count of current readers of the abandoned list + // in order to prevent resetting/decommitting segment memory if it might + // still be read. + mi_decl_cache_align _Atomic(size_t) abandoned_readers; // = 0 +} mi_abandoned_pool_t; + + +// ------------------------------------------------------ +// Heaps +// Provide first-class heaps to allocate from. +// A heap just owns a set of pages for allocation and +// can only be allocate/reallocate from the thread that created it. +// Freeing blocks can be done from any thread though. +// Per thread, the segments are shared among its heaps. +// Per thread, there is always a default heap that is +// used for allocation; it is initialized to statically +// point to an empty heap to avoid initialization checks +// in the fast path. +// ------------------------------------------------------ + +// Thread local data +typedef struct mi_tld_s mi_tld_t; + +// Pages of a certain block size are held in a queue. +typedef struct mi_page_queue_s { + mi_page_t* first; + mi_page_t* last; + size_t block_size; +} mi_page_queue_t; + +#define MI_BIN_FULL (MI_BIN_HUGE+1) + +// Random context +typedef struct mi_random_cxt_s { + uint32_t input[16]; + uint32_t output[16]; + int output_available; + bool weak; +} mi_random_ctx_t; + + +// In debug mode there is a padding structure at the end of the blocks to check for buffer overflows +#if (MI_PADDING) +typedef struct mi_padding_s { + uint32_t canary; // encoded block value to check validity of the padding (in case of overflow) + uint32_t delta; // padding bytes before the block. 
(mi_usable_size(p) - delta == exact allocated bytes) +} mi_padding_t; +#define MI_PADDING_SIZE (sizeof(mi_padding_t)) +#define MI_PADDING_WSIZE ((MI_PADDING_SIZE + MI_INTPTR_SIZE - 1) / MI_INTPTR_SIZE) +#else +#define MI_PADDING_SIZE 0 +#define MI_PADDING_WSIZE 0 +#endif + +#define MI_PAGES_DIRECT (MI_SMALL_WSIZE_MAX + MI_PADDING_WSIZE + 1) + + +// A heap owns a set of pages. +struct mi_heap_s { + mi_tld_t* tld; + mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. + mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") + _Atomic(mi_block_t*) thread_delayed_free; + mi_threadid_t thread_id; // thread this heap belongs too + mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) + uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) + uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list + mi_random_ctx_t random; // random number context used for secure allocation + size_t page_count; // total number of pages in the `pages` queues. + size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) + size_t page_retired_max; // largest retired index into the `pages` array. + mi_heap_t* next; // list of heaps per thread + bool no_reclaim; // `true` if this heap should not reclaim abandoned pages + uint8_t tag; // custom identifier for this heap + uint8_t debug_offset; // number of bytes to preserve when filling freed or uninitialized memory + bool page_use_qsbr; // should freeing pages be delayed using QSBR +}; + + + +// ------------------------------------------------------ +// Debug +// ------------------------------------------------------ + +#if !defined(MI_DEBUG_UNINIT) +#define MI_DEBUG_UNINIT (0xD0) +#endif +#if !defined(MI_DEBUG_FREED) +#define MI_DEBUG_FREED (0xDF) +#endif +#if !defined(MI_DEBUG_PADDING) +#define MI_DEBUG_PADDING (0xDE) +#endif + +#if (MI_DEBUG) +// use our own assertion to print without memory allocation +void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func ); +#define mi_assert(expr) ((expr) ? 
(void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__)) +#else +#define mi_assert(x) +#endif + +#if (MI_DEBUG>1) +#define mi_assert_internal mi_assert +#else +#define mi_assert_internal(x) +#endif + +#if (MI_DEBUG>2) +#define mi_assert_expensive mi_assert +#else +#define mi_assert_expensive(x) +#endif + +// ------------------------------------------------------ +// Statistics +// ------------------------------------------------------ + +#ifndef MI_STAT +#if (MI_DEBUG>0) +#define MI_STAT 2 +#else +#define MI_STAT 0 +#endif +#endif + +typedef struct mi_stat_count_s { + int64_t allocated; + int64_t freed; + int64_t peak; + int64_t current; +} mi_stat_count_t; + +typedef struct mi_stat_counter_s { + int64_t total; + int64_t count; +} mi_stat_counter_t; + +typedef struct mi_stats_s { + mi_stat_count_t segments; + mi_stat_count_t pages; + mi_stat_count_t reserved; + mi_stat_count_t committed; + mi_stat_count_t reset; + mi_stat_count_t purged; + mi_stat_count_t page_committed; + mi_stat_count_t segments_abandoned; + mi_stat_count_t pages_abandoned; + mi_stat_count_t threads; + mi_stat_count_t normal; + mi_stat_count_t huge; + mi_stat_count_t large; + mi_stat_count_t malloc; + mi_stat_count_t segments_cache; + mi_stat_counter_t pages_extended; + mi_stat_counter_t mmap_calls; + mi_stat_counter_t commit_calls; + mi_stat_counter_t reset_calls; + mi_stat_counter_t purge_calls; + mi_stat_counter_t page_no_retire; + mi_stat_counter_t searches; + mi_stat_counter_t normal_count; + mi_stat_counter_t huge_count; + mi_stat_counter_t large_count; +#if MI_STAT>1 + mi_stat_count_t normal_bins[MI_BIN_HUGE+1]; +#endif +} mi_stats_t; + + +void _mi_stat_increase(mi_stat_count_t* stat, size_t amount); +void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount); +void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); + +#if (MI_STAT) +#define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount) +#define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount) +#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount) +#else +#define mi_stat_increase(stat,amount) (void)0 +#define mi_stat_decrease(stat,amount) (void)0 +#define mi_stat_counter_increase(stat,amount) (void)0 +#endif + +#define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount) +#define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount) +#define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) + +// ------------------------------------------------------ +// Thread Local data +// ------------------------------------------------------ + +// A "span" is an available range of slices. The span queues keep +// track of slice spans of at most the given `slice_count` (but more than the previous size class). 
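+// Added illustrative note (not from the original header): mi_segment_bin() +// maps a slice count to a bin index, so a hypothetical first-fit search for +// a span of at least `needed` slices would start at the bin for `needed` and +// walk upward through the queues, roughly: +// +//   for (size_t bin = mi_segment_bin(needed); bin <= MI_SEGMENT_BIN_MAX; bin++) { +//     for (mi_slice_t* s = spans[bin].first; s != NULL; s = /* next in queue */) { +//       if (/* s spans >= needed slices */) { /* take this span */ } +//     } +//   }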
+typedef struct mi_span_queue_s { + mi_slice_t* first; + mi_slice_t* last; + size_t slice_count; +} mi_span_queue_t; + +#define MI_SEGMENT_BIN_MAX (35) // 35 == mi_segment_bin(MI_SLICES_PER_SEGMENT) + +// OS thread local data +typedef struct mi_os_tld_s { + size_t region_idx; // start point for next allocation + mi_stats_t* stats; // points to tld stats +} mi_os_tld_t; + + +// Segments thread local data +typedef struct mi_segments_tld_s { + mi_span_queue_t spans[MI_SEGMENT_BIN_MAX+1]; // free slice spans inside segments + size_t count; // current number of segments; + size_t peak_count; // peak number of segments + size_t current_size; // current size of all segments + size_t peak_size; // peak size of all segments + mi_stats_t* stats; // points to tld stats + mi_os_tld_t* os; // points to os stats + mi_abandoned_pool_t* abandoned; // pool of abandoned segments +} mi_segments_tld_t; + +// Thread local data +struct mi_tld_s { + unsigned long long heartbeat; // monotonic heartbeat count + bool recurse; // true if deferred was called; used to prevent infinite recursion. + mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) + mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) + mi_segments_tld_t segments; // segment tld + mi_os_tld_t os; // os tld + mi_stats_t stats; // statistics +}; + +#endif diff --git a/Include/internal/pycore_abstract.h b/Include/internal/pycore_abstract.h new file mode 100644 index 0000000000000000000000000000000000000000..3cc0afac4bd5b45490d3edd8e92062bec837da8a --- /dev/null +++ b/Include/internal/pycore_abstract.h @@ -0,0 +1,61 @@ +#ifndef Py_INTERNAL_ABSTRACT_H +#define Py_INTERNAL_ABSTRACT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +// Fast inlined version of PyIndex_Check() +static inline int +_PyIndex_Check(PyObject *obj) +{ + PyNumberMethods *tp_as_number = Py_TYPE(obj)->tp_as_number; + return (tp_as_number != NULL && tp_as_number->nb_index != NULL); +} + +PyObject *_PyNumber_PowerNoMod(PyObject *lhs, PyObject *rhs); +PyObject *_PyNumber_InPlacePowerNoMod(PyObject *lhs, PyObject *rhs); + +extern int _PyObject_HasLen(PyObject *o); + +/* === Sequence protocol ================================================ */ + +#define PY_ITERSEARCH_COUNT 1 +#define PY_ITERSEARCH_INDEX 2 +#define PY_ITERSEARCH_CONTAINS 3 + +/* Iterate over seq. + + Result depends on the operation: + + PY_ITERSEARCH_COUNT: return # of times obj appears in seq; -1 if + error. + PY_ITERSEARCH_INDEX: return 0-based index of first occurrence of + obj in seq; set ValueError and return -1 if none found; + also return -1 on error. + PY_ITERSEARCH_CONTAINS: return 1 if obj in seq, else 0; -1 on + error. */ +extern Py_ssize_t _PySequence_IterSearch(PyObject *seq, + PyObject *obj, int operation); + +/* === Mapping protocol ================================================= */ + +extern int _PyObject_RealIsInstance(PyObject *inst, PyObject *cls); + +extern int _PyObject_RealIsSubclass(PyObject *derived, PyObject *cls); + +// Convert Python int to Py_ssize_t. Do nothing if the argument is None. +// Export for '_bisect' shared extension. +PyAPI_FUNC(int) _Py_convert_optional_to_ssize_t(PyObject *, void *); + +// Same as PyNumber_Index() but can return an instance of a subclass of int. +// Export for 'math' shared extension. 
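+// Added illustrative example (an assumption, not from this header): since +// bool subclasses int, _PyNumber_Index() may return a bool argument unchanged, +// whereas the public PyNumber_Index() normalizes the result to an exact int.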
+PyAPI_FUNC(PyObject*) _PyNumber_Index(PyObject *o); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_ABSTRACT_H */ diff --git a/Include/internal/pycore_asdl.h b/Include/internal/pycore_asdl.h new file mode 100644 index 0000000000000000000000000000000000000000..afeada88d13e24ceb69ebc07456964b26ba8e3c1 --- /dev/null +++ b/Include/internal/pycore_asdl.h @@ -0,0 +1,112 @@ +#ifndef Py_INTERNAL_ASDL_H +#define Py_INTERNAL_ASDL_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_pyarena.h" // _PyArena_Malloc() + +typedef PyObject * identifier; +typedef PyObject * string; +typedef PyObject * object; +typedef PyObject * constant; + +/* It would be nice if the code generated by asdl_c.py was completely + independent of Python, but it is a goal that requires too much work + at this stage. So, for example, I'll represent identifiers as + interned Python strings. +*/ + +#define _ASDL_SEQ_HEAD \ + Py_ssize_t size; \ + void **elements; + +typedef struct { + _ASDL_SEQ_HEAD +} asdl_seq; + +typedef struct { + _ASDL_SEQ_HEAD + void *typed_elements[1]; +} asdl_generic_seq; + +typedef struct { + _ASDL_SEQ_HEAD + PyObject *typed_elements[1]; +} asdl_identifier_seq; + +typedef struct { + _ASDL_SEQ_HEAD + int typed_elements[1]; +} asdl_int_seq; + +asdl_generic_seq *_Py_asdl_generic_seq_new(Py_ssize_t size, PyArena *arena); +asdl_identifier_seq *_Py_asdl_identifier_seq_new(Py_ssize_t size, PyArena *arena); +asdl_int_seq *_Py_asdl_int_seq_new(Py_ssize_t size, PyArena *arena); + + +#define GENERATE_ASDL_SEQ_CONSTRUCTOR(NAME, TYPE) \ +asdl_ ## NAME ## _seq *_Py_asdl_ ## NAME ## _seq_new(Py_ssize_t size, PyArena *arena) \ +{ \ + asdl_ ## NAME ## _seq *seq = NULL; \ + size_t n; \ + /* check size is sane */ \ + if (size < 0 || \ + (size && (((size_t)size - 1) > (SIZE_MAX / sizeof(void *))))) { \ + PyErr_NoMemory(); \ + return NULL; \ + } \ + n = (size ? (sizeof(TYPE *) * (size - 1)) : 0); \ + /* check if size can be added safely */ \ + if (n > SIZE_MAX - sizeof(asdl_ ## NAME ## _seq)) { \ + PyErr_NoMemory(); \ + return NULL; \ + } \ + n += sizeof(asdl_ ## NAME ## _seq); \ + seq = (asdl_ ## NAME ## _seq *)_PyArena_Malloc(arena, n); \ + if (!seq) { \ + PyErr_NoMemory(); \ + return NULL; \ + } \ + memset(seq, 0, n); \ + seq->size = size; \ + seq->elements = (void**)seq->typed_elements; \ + return seq; \ +} + +#define asdl_seq_GET_UNTYPED(S, I) _Py_RVALUE((S)->elements[(I)]) +#define asdl_seq_GET(S, I) _Py_RVALUE((S)->typed_elements[(I)]) +#define asdl_seq_LEN(S) _Py_RVALUE(((S) == NULL ?
0 : (S)->size)) + +#ifdef Py_DEBUG +# define asdl_seq_SET(S, I, V) \ + do { \ + Py_ssize_t _asdl_i = (I); \ + assert((S) != NULL); \ + assert(0 <= _asdl_i && _asdl_i < (S)->size); \ + (S)->typed_elements[_asdl_i] = (V); \ + } while (0) +#else +# define asdl_seq_SET(S, I, V) _Py_RVALUE((S)->typed_elements[(I)] = (V)) +#endif + +#ifdef Py_DEBUG +# define asdl_seq_SET_UNTYPED(S, I, V) \ + do { \ + Py_ssize_t _asdl_i = (I); \ + assert((S) != NULL); \ + assert(0 <= _asdl_i && _asdl_i < (S)->size); \ + (S)->elements[_asdl_i] = (V); \ + } while (0) +#else +# define asdl_seq_SET_UNTYPED(S, I, V) _Py_RVALUE((S)->elements[(I)] = (V)) +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_ASDL_H */ diff --git a/Include/internal/pycore_ast.h b/Include/internal/pycore_ast.h new file mode 100644 index 0000000000000000000000000000000000000000..f5bf1205a82be98eb6e8c96c45ff540ebc559c3b --- /dev/null +++ b/Include/internal/pycore_ast.h @@ -0,0 +1,926 @@ +// File automatically generated by Parser/asdl_c.py. + +#ifndef Py_INTERNAL_AST_H +#define Py_INTERNAL_AST_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_asdl.h" // _ASDL_SEQ_HEAD + +typedef struct _mod *mod_ty; + +typedef struct _stmt *stmt_ty; + +typedef struct _expr *expr_ty; + +typedef enum _expr_context { Load=1, Store=2, Del=3 } expr_context_ty; + +typedef enum _boolop { And=1, Or=2 } boolop_ty; + +typedef enum _operator { Add=1, Sub=2, Mult=3, MatMult=4, Div=5, Mod=6, Pow=7, + LShift=8, RShift=9, BitOr=10, BitXor=11, BitAnd=12, + FloorDiv=13 } operator_ty; + +typedef enum _unaryop { Invert=1, Not=2, UAdd=3, USub=4 } unaryop_ty; + +typedef enum _cmpop { Eq=1, NotEq=2, Lt=3, LtE=4, Gt=5, GtE=6, Is=7, IsNot=8, + In=9, NotIn=10 } cmpop_ty; + +typedef struct _comprehension *comprehension_ty; + +typedef struct _excepthandler *excepthandler_ty; + +typedef struct _arguments *arguments_ty; + +typedef struct _arg *arg_ty; + +typedef struct _keyword *keyword_ty; + +typedef struct _alias *alias_ty; + +typedef struct _withitem *withitem_ty; + +typedef struct _match_case *match_case_ty; + +typedef struct _pattern *pattern_ty; + +typedef struct _type_ignore *type_ignore_ty; + +typedef struct _type_param *type_param_ty; + + +typedef struct { + _ASDL_SEQ_HEAD + mod_ty typed_elements[1]; +} asdl_mod_seq; + +asdl_mod_seq *_Py_asdl_mod_seq_new(Py_ssize_t size, PyArena *arena); + +typedef struct { + _ASDL_SEQ_HEAD + stmt_ty typed_elements[1]; +} asdl_stmt_seq; + +asdl_stmt_seq *_Py_asdl_stmt_seq_new(Py_ssize_t size, PyArena *arena); + +typedef struct { + _ASDL_SEQ_HEAD + expr_ty typed_elements[1]; +} asdl_expr_seq; + +asdl_expr_seq *_Py_asdl_expr_seq_new(Py_ssize_t size, PyArena *arena); + +typedef struct { + _ASDL_SEQ_HEAD + comprehension_ty typed_elements[1]; +} asdl_comprehension_seq; + +asdl_comprehension_seq *_Py_asdl_comprehension_seq_new(Py_ssize_t size, PyArena + *arena); + +typedef struct { + _ASDL_SEQ_HEAD + excepthandler_ty typed_elements[1]; +} asdl_excepthandler_seq; + +asdl_excepthandler_seq *_Py_asdl_excepthandler_seq_new(Py_ssize_t size, PyArena + *arena); + +typedef struct { + _ASDL_SEQ_HEAD + arguments_ty typed_elements[1]; +} asdl_arguments_seq; + +asdl_arguments_seq *_Py_asdl_arguments_seq_new(Py_ssize_t size, PyArena *arena); + +typedef struct { + _ASDL_SEQ_HEAD + arg_ty typed_elements[1]; +} asdl_arg_seq; + +asdl_arg_seq *_Py_asdl_arg_seq_new(Py_ssize_t size, PyArena *arena); + +typedef struct { + _ASDL_SEQ_HEAD + keyword_ty 
typed_elements[1]; +} asdl_keyword_seq; + +asdl_keyword_seq *_Py_asdl_keyword_seq_new(Py_ssize_t size, PyArena *arena); + +typedef struct { + _ASDL_SEQ_HEAD + alias_ty typed_elements[1]; +} asdl_alias_seq; + +asdl_alias_seq *_Py_asdl_alias_seq_new(Py_ssize_t size, PyArena *arena); + +typedef struct { + _ASDL_SEQ_HEAD + withitem_ty typed_elements[1]; +} asdl_withitem_seq; + +asdl_withitem_seq *_Py_asdl_withitem_seq_new(Py_ssize_t size, PyArena *arena); + +typedef struct { + _ASDL_SEQ_HEAD + match_case_ty typed_elements[1]; +} asdl_match_case_seq; + +asdl_match_case_seq *_Py_asdl_match_case_seq_new(Py_ssize_t size, PyArena + *arena); + +typedef struct { + _ASDL_SEQ_HEAD + pattern_ty typed_elements[1]; +} asdl_pattern_seq; + +asdl_pattern_seq *_Py_asdl_pattern_seq_new(Py_ssize_t size, PyArena *arena); + +typedef struct { + _ASDL_SEQ_HEAD + type_ignore_ty typed_elements[1]; +} asdl_type_ignore_seq; + +asdl_type_ignore_seq *_Py_asdl_type_ignore_seq_new(Py_ssize_t size, PyArena + *arena); + +typedef struct { + _ASDL_SEQ_HEAD + type_param_ty typed_elements[1]; +} asdl_type_param_seq; + +asdl_type_param_seq *_Py_asdl_type_param_seq_new(Py_ssize_t size, PyArena + *arena); + + +enum _mod_kind {Module_kind=1, Interactive_kind=2, Expression_kind=3, + FunctionType_kind=4}; +struct _mod { + enum _mod_kind kind; + union { + struct { + asdl_stmt_seq *body; + asdl_type_ignore_seq *type_ignores; + } Module; + + struct { + asdl_stmt_seq *body; + } Interactive; + + struct { + expr_ty body; + } Expression; + + struct { + asdl_expr_seq *argtypes; + expr_ty returns; + } FunctionType; + + } v; +}; + +enum _stmt_kind {FunctionDef_kind=1, AsyncFunctionDef_kind=2, ClassDef_kind=3, + Return_kind=4, Delete_kind=5, Assign_kind=6, + TypeAlias_kind=7, AugAssign_kind=8, AnnAssign_kind=9, + For_kind=10, AsyncFor_kind=11, While_kind=12, If_kind=13, + With_kind=14, AsyncWith_kind=15, Match_kind=16, + Raise_kind=17, Try_kind=18, TryStar_kind=19, Assert_kind=20, + Import_kind=21, ImportFrom_kind=22, Global_kind=23, + Nonlocal_kind=24, Expr_kind=25, Pass_kind=26, Break_kind=27, + Continue_kind=28}; +struct _stmt { + enum _stmt_kind kind; + union { + struct { + identifier name; + arguments_ty args; + asdl_stmt_seq *body; + asdl_expr_seq *decorator_list; + expr_ty returns; + string type_comment; + asdl_type_param_seq *type_params; + } FunctionDef; + + struct { + identifier name; + arguments_ty args; + asdl_stmt_seq *body; + asdl_expr_seq *decorator_list; + expr_ty returns; + string type_comment; + asdl_type_param_seq *type_params; + } AsyncFunctionDef; + + struct { + identifier name; + asdl_expr_seq *bases; + asdl_keyword_seq *keywords; + asdl_stmt_seq *body; + asdl_expr_seq *decorator_list; + asdl_type_param_seq *type_params; + } ClassDef; + + struct { + expr_ty value; + } Return; + + struct { + asdl_expr_seq *targets; + } Delete; + + struct { + asdl_expr_seq *targets; + expr_ty value; + string type_comment; + } Assign; + + struct { + expr_ty name; + asdl_type_param_seq *type_params; + expr_ty value; + } TypeAlias; + + struct { + expr_ty target; + operator_ty op; + expr_ty value; + } AugAssign; + + struct { + expr_ty target; + expr_ty annotation; + expr_ty value; + int simple; + } AnnAssign; + + struct { + expr_ty target; + expr_ty iter; + asdl_stmt_seq *body; + asdl_stmt_seq *orelse; + string type_comment; + } For; + + struct { + expr_ty target; + expr_ty iter; + asdl_stmt_seq *body; + asdl_stmt_seq *orelse; + string type_comment; + } AsyncFor; + + struct { + expr_ty test; + asdl_stmt_seq *body; + asdl_stmt_seq *orelse; + } 
While; + + struct { + expr_ty test; + asdl_stmt_seq *body; + asdl_stmt_seq *orelse; + } If; + + struct { + asdl_withitem_seq *items; + asdl_stmt_seq *body; + string type_comment; + } With; + + struct { + asdl_withitem_seq *items; + asdl_stmt_seq *body; + string type_comment; + } AsyncWith; + + struct { + expr_ty subject; + asdl_match_case_seq *cases; + } Match; + + struct { + expr_ty exc; + expr_ty cause; + } Raise; + + struct { + asdl_stmt_seq *body; + asdl_excepthandler_seq *handlers; + asdl_stmt_seq *orelse; + asdl_stmt_seq *finalbody; + } Try; + + struct { + asdl_stmt_seq *body; + asdl_excepthandler_seq *handlers; + asdl_stmt_seq *orelse; + asdl_stmt_seq *finalbody; + } TryStar; + + struct { + expr_ty test; + expr_ty msg; + } Assert; + + struct { + asdl_alias_seq *names; + } Import; + + struct { + identifier module; + asdl_alias_seq *names; + int level; + } ImportFrom; + + struct { + asdl_identifier_seq *names; + } Global; + + struct { + asdl_identifier_seq *names; + } Nonlocal; + + struct { + expr_ty value; + } Expr; + + } v; + int lineno; + int col_offset; + int end_lineno; + int end_col_offset; +}; + +enum _expr_kind {BoolOp_kind=1, NamedExpr_kind=2, BinOp_kind=3, UnaryOp_kind=4, + Lambda_kind=5, IfExp_kind=6, Dict_kind=7, Set_kind=8, + ListComp_kind=9, SetComp_kind=10, DictComp_kind=11, + GeneratorExp_kind=12, Await_kind=13, Yield_kind=14, + YieldFrom_kind=15, Compare_kind=16, Call_kind=17, + FormattedValue_kind=18, JoinedStr_kind=19, Constant_kind=20, + Attribute_kind=21, Subscript_kind=22, Starred_kind=23, + Name_kind=24, List_kind=25, Tuple_kind=26, Slice_kind=27}; +struct _expr { + enum _expr_kind kind; + union { + struct { + boolop_ty op; + asdl_expr_seq *values; + } BoolOp; + + struct { + expr_ty target; + expr_ty value; + } NamedExpr; + + struct { + expr_ty left; + operator_ty op; + expr_ty right; + } BinOp; + + struct { + unaryop_ty op; + expr_ty operand; + } UnaryOp; + + struct { + arguments_ty args; + expr_ty body; + } Lambda; + + struct { + expr_ty test; + expr_ty body; + expr_ty orelse; + } IfExp; + + struct { + asdl_expr_seq *keys; + asdl_expr_seq *values; + } Dict; + + struct { + asdl_expr_seq *elts; + } Set; + + struct { + expr_ty elt; + asdl_comprehension_seq *generators; + } ListComp; + + struct { + expr_ty elt; + asdl_comprehension_seq *generators; + } SetComp; + + struct { + expr_ty key; + expr_ty value; + asdl_comprehension_seq *generators; + } DictComp; + + struct { + expr_ty elt; + asdl_comprehension_seq *generators; + } GeneratorExp; + + struct { + expr_ty value; + } Await; + + struct { + expr_ty value; + } Yield; + + struct { + expr_ty value; + } YieldFrom; + + struct { + expr_ty left; + asdl_int_seq *ops; + asdl_expr_seq *comparators; + } Compare; + + struct { + expr_ty func; + asdl_expr_seq *args; + asdl_keyword_seq *keywords; + } Call; + + struct { + expr_ty value; + int conversion; + expr_ty format_spec; + } FormattedValue; + + struct { + asdl_expr_seq *values; + } JoinedStr; + + struct { + constant value; + string kind; + } Constant; + + struct { + expr_ty value; + identifier attr; + expr_context_ty ctx; + } Attribute; + + struct { + expr_ty value; + expr_ty slice; + expr_context_ty ctx; + } Subscript; + + struct { + expr_ty value; + expr_context_ty ctx; + } Starred; + + struct { + identifier id; + expr_context_ty ctx; + } Name; + + struct { + asdl_expr_seq *elts; + expr_context_ty ctx; + } List; + + struct { + asdl_expr_seq *elts; + expr_context_ty ctx; + } Tuple; + + struct { + expr_ty lower; + expr_ty upper; + expr_ty step; + } Slice; + + } v; + int 
lineno; + int col_offset; + int end_lineno; + int end_col_offset; +}; + +struct _comprehension { + expr_ty target; + expr_ty iter; + asdl_expr_seq *ifs; + int is_async; +}; + +enum _excepthandler_kind {ExceptHandler_kind=1}; +struct _excepthandler { + enum _excepthandler_kind kind; + union { + struct { + expr_ty type; + identifier name; + asdl_stmt_seq *body; + } ExceptHandler; + + } v; + int lineno; + int col_offset; + int end_lineno; + int end_col_offset; +}; + +struct _arguments { + asdl_arg_seq *posonlyargs; + asdl_arg_seq *args; + arg_ty vararg; + asdl_arg_seq *kwonlyargs; + asdl_expr_seq *kw_defaults; + arg_ty kwarg; + asdl_expr_seq *defaults; +}; + +struct _arg { + identifier arg; + expr_ty annotation; + string type_comment; + int lineno; + int col_offset; + int end_lineno; + int end_col_offset; +}; + +struct _keyword { + identifier arg; + expr_ty value; + int lineno; + int col_offset; + int end_lineno; + int end_col_offset; +}; + +struct _alias { + identifier name; + identifier asname; + int lineno; + int col_offset; + int end_lineno; + int end_col_offset; +}; + +struct _withitem { + expr_ty context_expr; + expr_ty optional_vars; +}; + +struct _match_case { + pattern_ty pattern; + expr_ty guard; + asdl_stmt_seq *body; +}; + +enum _pattern_kind {MatchValue_kind=1, MatchSingleton_kind=2, + MatchSequence_kind=3, MatchMapping_kind=4, + MatchClass_kind=5, MatchStar_kind=6, MatchAs_kind=7, + MatchOr_kind=8}; +struct _pattern { + enum _pattern_kind kind; + union { + struct { + expr_ty value; + } MatchValue; + + struct { + constant value; + } MatchSingleton; + + struct { + asdl_pattern_seq *patterns; + } MatchSequence; + + struct { + asdl_expr_seq *keys; + asdl_pattern_seq *patterns; + identifier rest; + } MatchMapping; + + struct { + expr_ty cls; + asdl_pattern_seq *patterns; + asdl_identifier_seq *kwd_attrs; + asdl_pattern_seq *kwd_patterns; + } MatchClass; + + struct { + identifier name; + } MatchStar; + + struct { + pattern_ty pattern; + identifier name; + } MatchAs; + + struct { + asdl_pattern_seq *patterns; + } MatchOr; + + } v; + int lineno; + int col_offset; + int end_lineno; + int end_col_offset; +}; + +enum _type_ignore_kind {TypeIgnore_kind=1}; +struct _type_ignore { + enum _type_ignore_kind kind; + union { + struct { + int lineno; + string tag; + } TypeIgnore; + + } v; +}; + +enum _type_param_kind {TypeVar_kind=1, ParamSpec_kind=2, TypeVarTuple_kind=3}; +struct _type_param { + enum _type_param_kind kind; + union { + struct { + identifier name; + expr_ty bound; + expr_ty default_value; + } TypeVar; + + struct { + identifier name; + expr_ty default_value; + } ParamSpec; + + struct { + identifier name; + expr_ty default_value; + } TypeVarTuple; + + } v; + int lineno; + int col_offset; + int end_lineno; + int end_col_offset; +}; + + +// Note: these macros affect function definitions, not only call sites. 
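+// Added usage sketch (not part of the generated header): building a minimal +// module holding a single `pass` statement with the constructors declared +// below; _PyArena_New() is assumed from pycore_pyarena.h. +// +//   PyArena *arena = _PyArena_New();                        // assumed helper +//   asdl_stmt_seq *body = _Py_asdl_stmt_seq_new(1, arena); +//   asdl_seq_SET(body, 0, _PyAST_Pass(1, 0, 1, 4, arena));  // line 1, cols 0-4 +//   mod_ty mod = _PyAST_Module(body, NULL, arena);          // type_ignores omitted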
+mod_ty _PyAST_Module(asdl_stmt_seq * body, asdl_type_ignore_seq * type_ignores, + PyArena *arena); +mod_ty _PyAST_Interactive(asdl_stmt_seq * body, PyArena *arena); +mod_ty _PyAST_Expression(expr_ty body, PyArena *arena); +mod_ty _PyAST_FunctionType(asdl_expr_seq * argtypes, expr_ty returns, PyArena + *arena); +stmt_ty _PyAST_FunctionDef(identifier name, arguments_ty args, asdl_stmt_seq * + body, asdl_expr_seq * decorator_list, expr_ty + returns, string type_comment, asdl_type_param_seq * + type_params, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +stmt_ty _PyAST_AsyncFunctionDef(identifier name, arguments_ty args, + asdl_stmt_seq * body, asdl_expr_seq * + decorator_list, expr_ty returns, string + type_comment, asdl_type_param_seq * + type_params, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +stmt_ty _PyAST_ClassDef(identifier name, asdl_expr_seq * bases, + asdl_keyword_seq * keywords, asdl_stmt_seq * body, + asdl_expr_seq * decorator_list, asdl_type_param_seq * + type_params, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +stmt_ty _PyAST_Return(expr_ty value, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +stmt_ty _PyAST_Delete(asdl_expr_seq * targets, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +stmt_ty _PyAST_Assign(asdl_expr_seq * targets, expr_ty value, string + type_comment, int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena); +stmt_ty _PyAST_TypeAlias(expr_ty name, asdl_type_param_seq * type_params, + expr_ty value, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +stmt_ty _PyAST_AugAssign(expr_ty target, operator_ty op, expr_ty value, int + lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +stmt_ty _PyAST_AnnAssign(expr_ty target, expr_ty annotation, expr_ty value, int + simple, int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena); +stmt_ty _PyAST_For(expr_ty target, expr_ty iter, asdl_stmt_seq * body, + asdl_stmt_seq * orelse, string type_comment, int lineno, int + col_offset, int end_lineno, int end_col_offset, PyArena + *arena); +stmt_ty _PyAST_AsyncFor(expr_ty target, expr_ty iter, asdl_stmt_seq * body, + asdl_stmt_seq * orelse, string type_comment, int + lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +stmt_ty _PyAST_While(expr_ty test, asdl_stmt_seq * body, asdl_stmt_seq * + orelse, int lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +stmt_ty _PyAST_If(expr_ty test, asdl_stmt_seq * body, asdl_stmt_seq * orelse, + int lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +stmt_ty _PyAST_With(asdl_withitem_seq * items, asdl_stmt_seq * body, string + type_comment, int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena); +stmt_ty _PyAST_AsyncWith(asdl_withitem_seq * items, asdl_stmt_seq * body, + string type_comment, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +stmt_ty _PyAST_Match(expr_ty subject, asdl_match_case_seq * cases, int lineno, + int col_offset, int end_lineno, int end_col_offset, + PyArena *arena); +stmt_ty _PyAST_Raise(expr_ty exc, expr_ty cause, int lineno, int col_offset, + int end_lineno, int end_col_offset, PyArena *arena); +stmt_ty _PyAST_Try(asdl_stmt_seq * body, asdl_excepthandler_seq * handlers, + 
asdl_stmt_seq * orelse, asdl_stmt_seq * finalbody, int + lineno, int col_offset, int end_lineno, int end_col_offset, + PyArena *arena); +stmt_ty _PyAST_TryStar(asdl_stmt_seq * body, asdl_excepthandler_seq * handlers, + asdl_stmt_seq * orelse, asdl_stmt_seq * finalbody, int + lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +stmt_ty _PyAST_Assert(expr_ty test, expr_ty msg, int lineno, int col_offset, + int end_lineno, int end_col_offset, PyArena *arena); +stmt_ty _PyAST_Import(asdl_alias_seq * names, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +stmt_ty _PyAST_ImportFrom(identifier module, asdl_alias_seq * names, int level, + int lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +stmt_ty _PyAST_Global(asdl_identifier_seq * names, int lineno, int col_offset, + int end_lineno, int end_col_offset, PyArena *arena); +stmt_ty _PyAST_Nonlocal(asdl_identifier_seq * names, int lineno, int + col_offset, int end_lineno, int end_col_offset, PyArena + *arena); +stmt_ty _PyAST_Expr(expr_ty value, int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena); +stmt_ty _PyAST_Pass(int lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +stmt_ty _PyAST_Break(int lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +stmt_ty _PyAST_Continue(int lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +expr_ty _PyAST_BoolOp(boolop_ty op, asdl_expr_seq * values, int lineno, int + col_offset, int end_lineno, int end_col_offset, PyArena + *arena); +expr_ty _PyAST_NamedExpr(expr_ty target, expr_ty value, int lineno, int + col_offset, int end_lineno, int end_col_offset, + PyArena *arena); +expr_ty _PyAST_BinOp(expr_ty left, operator_ty op, expr_ty right, int lineno, + int col_offset, int end_lineno, int end_col_offset, + PyArena *arena); +expr_ty _PyAST_UnaryOp(unaryop_ty op, expr_ty operand, int lineno, int + col_offset, int end_lineno, int end_col_offset, PyArena + *arena); +expr_ty _PyAST_Lambda(arguments_ty args, expr_ty body, int lineno, int + col_offset, int end_lineno, int end_col_offset, PyArena + *arena); +expr_ty _PyAST_IfExp(expr_ty test, expr_ty body, expr_ty orelse, int lineno, + int col_offset, int end_lineno, int end_col_offset, + PyArena *arena); +expr_ty _PyAST_Dict(asdl_expr_seq * keys, asdl_expr_seq * values, int lineno, + int col_offset, int end_lineno, int end_col_offset, PyArena + *arena); +expr_ty _PyAST_Set(asdl_expr_seq * elts, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +expr_ty _PyAST_ListComp(expr_ty elt, asdl_comprehension_seq * generators, int + lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +expr_ty _PyAST_SetComp(expr_ty elt, asdl_comprehension_seq * generators, int + lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +expr_ty _PyAST_DictComp(expr_ty key, expr_ty value, asdl_comprehension_seq * + generators, int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena); +expr_ty _PyAST_GeneratorExp(expr_ty elt, asdl_comprehension_seq * generators, + int lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +expr_ty _PyAST_Await(expr_ty value, int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena); +expr_ty _PyAST_Yield(expr_ty value, int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena); +expr_ty 
_PyAST_YieldFrom(expr_ty value, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +expr_ty _PyAST_Compare(expr_ty left, asdl_int_seq * ops, asdl_expr_seq * + comparators, int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena); +expr_ty _PyAST_Call(expr_ty func, asdl_expr_seq * args, asdl_keyword_seq * + keywords, int lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +expr_ty _PyAST_FormattedValue(expr_ty value, int conversion, expr_ty + format_spec, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +expr_ty _PyAST_JoinedStr(asdl_expr_seq * values, int lineno, int col_offset, + int end_lineno, int end_col_offset, PyArena *arena); +expr_ty _PyAST_Constant(constant value, string kind, int lineno, int + col_offset, int end_lineno, int end_col_offset, PyArena + *arena); +expr_ty _PyAST_Attribute(expr_ty value, identifier attr, expr_context_ty ctx, + int lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +expr_ty _PyAST_Subscript(expr_ty value, expr_ty slice, expr_context_ty ctx, int + lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +expr_ty _PyAST_Starred(expr_ty value, expr_context_ty ctx, int lineno, int + col_offset, int end_lineno, int end_col_offset, PyArena + *arena); +expr_ty _PyAST_Name(identifier id, expr_context_ty ctx, int lineno, int + col_offset, int end_lineno, int end_col_offset, PyArena + *arena); +expr_ty _PyAST_List(asdl_expr_seq * elts, expr_context_ty ctx, int lineno, int + col_offset, int end_lineno, int end_col_offset, PyArena + *arena); +expr_ty _PyAST_Tuple(asdl_expr_seq * elts, expr_context_ty ctx, int lineno, int + col_offset, int end_lineno, int end_col_offset, PyArena + *arena); +expr_ty _PyAST_Slice(expr_ty lower, expr_ty upper, expr_ty step, int lineno, + int col_offset, int end_lineno, int end_col_offset, + PyArena *arena); +comprehension_ty _PyAST_comprehension(expr_ty target, expr_ty iter, + asdl_expr_seq * ifs, int is_async, + PyArena *arena); +excepthandler_ty _PyAST_ExceptHandler(expr_ty type, identifier name, + asdl_stmt_seq * body, int lineno, int + col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +arguments_ty _PyAST_arguments(asdl_arg_seq * posonlyargs, asdl_arg_seq * args, + arg_ty vararg, asdl_arg_seq * kwonlyargs, + asdl_expr_seq * kw_defaults, arg_ty kwarg, + asdl_expr_seq * defaults, PyArena *arena); +arg_ty _PyAST_arg(identifier arg, expr_ty annotation, string type_comment, int + lineno, int col_offset, int end_lineno, int end_col_offset, + PyArena *arena); +keyword_ty _PyAST_keyword(identifier arg, expr_ty value, int lineno, int + col_offset, int end_lineno, int end_col_offset, + PyArena *arena); +alias_ty _PyAST_alias(identifier name, identifier asname, int lineno, int + col_offset, int end_lineno, int end_col_offset, PyArena + *arena); +withitem_ty _PyAST_withitem(expr_ty context_expr, expr_ty optional_vars, + PyArena *arena); +match_case_ty _PyAST_match_case(pattern_ty pattern, expr_ty guard, + asdl_stmt_seq * body, PyArena *arena); +pattern_ty _PyAST_MatchValue(expr_ty value, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +pattern_ty _PyAST_MatchSingleton(constant value, int lineno, int col_offset, + int end_lineno, int end_col_offset, PyArena + *arena); +pattern_ty _PyAST_MatchSequence(asdl_pattern_seq * patterns, int lineno, int + col_offset, int end_lineno, int end_col_offset, + PyArena *arena); +pattern_ty 
_PyAST_MatchMapping(asdl_expr_seq * keys, asdl_pattern_seq * + patterns, identifier rest, int lineno, int + col_offset, int end_lineno, int end_col_offset, + PyArena *arena); +pattern_ty _PyAST_MatchClass(expr_ty cls, asdl_pattern_seq * patterns, + asdl_identifier_seq * kwd_attrs, asdl_pattern_seq + * kwd_patterns, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +pattern_ty _PyAST_MatchStar(identifier name, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +pattern_ty _PyAST_MatchAs(pattern_ty pattern, identifier name, int lineno, int + col_offset, int end_lineno, int end_col_offset, + PyArena *arena); +pattern_ty _PyAST_MatchOr(asdl_pattern_seq * patterns, int lineno, int + col_offset, int end_lineno, int end_col_offset, + PyArena *arena); +type_ignore_ty _PyAST_TypeIgnore(int lineno, string tag, PyArena *arena); +type_param_ty _PyAST_TypeVar(identifier name, expr_ty bound, expr_ty + default_value, int lineno, int col_offset, int + end_lineno, int end_col_offset, PyArena *arena); +type_param_ty _PyAST_ParamSpec(identifier name, expr_ty default_value, int + lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); +type_param_ty _PyAST_TypeVarTuple(identifier name, expr_ty default_value, int + lineno, int col_offset, int end_lineno, int + end_col_offset, PyArena *arena); + + +PyObject* PyAST_mod2obj(mod_ty t); +mod_ty PyAST_obj2mod(PyObject* ast, PyArena* arena, int mode); +int PyAST_Check(PyObject* obj); + +extern int _PyAST_Validate(mod_ty); + +/* _PyAST_ExprAsUnicode is defined in ast_unparse.c */ +extern PyObject* _PyAST_ExprAsUnicode(expr_ty); + +/* Return a borrowed reference to the first literal string in the + sequence of statements, or NULL if it doesn't start with a literal string. + Doesn't set an exception. */ +extern PyObject* _PyAST_GetDocString(asdl_stmt_seq *); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_AST_H */ diff --git a/Include/internal/pycore_ast_state.h b/Include/internal/pycore_ast_state.h new file mode 100644 index 0000000000000000000000000000000000000000..09ae95465495c01e32a78d00eac8989786762b08 --- /dev/null +++ b/Include/internal/pycore_ast_state.h @@ -0,0 +1,268 @@ +// File automatically generated by Parser/asdl_c.py.
+ +#ifndef Py_INTERNAL_AST_STATE_H +#define Py_INTERNAL_AST_STATE_H + +#include "pycore_lock.h" // _PyOnceFlag + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +struct ast_state { + _PyOnceFlag once; + int finalized; + PyObject *AST_type; + PyObject *Add_singleton; + PyObject *Add_type; + PyObject *And_singleton; + PyObject *And_type; + PyObject *AnnAssign_type; + PyObject *Assert_type; + PyObject *Assign_type; + PyObject *AsyncFor_type; + PyObject *AsyncFunctionDef_type; + PyObject *AsyncWith_type; + PyObject *Attribute_type; + PyObject *AugAssign_type; + PyObject *Await_type; + PyObject *BinOp_type; + PyObject *BitAnd_singleton; + PyObject *BitAnd_type; + PyObject *BitOr_singleton; + PyObject *BitOr_type; + PyObject *BitXor_singleton; + PyObject *BitXor_type; + PyObject *BoolOp_type; + PyObject *Break_type; + PyObject *Call_type; + PyObject *ClassDef_type; + PyObject *Compare_type; + PyObject *Constant_type; + PyObject *Continue_type; + PyObject *Del_singleton; + PyObject *Del_type; + PyObject *Delete_type; + PyObject *DictComp_type; + PyObject *Dict_type; + PyObject *Div_singleton; + PyObject *Div_type; + PyObject *Eq_singleton; + PyObject *Eq_type; + PyObject *ExceptHandler_type; + PyObject *Expr_type; + PyObject *Expression_type; + PyObject *FloorDiv_singleton; + PyObject *FloorDiv_type; + PyObject *For_type; + PyObject *FormattedValue_type; + PyObject *FunctionDef_type; + PyObject *FunctionType_type; + PyObject *GeneratorExp_type; + PyObject *Global_type; + PyObject *GtE_singleton; + PyObject *GtE_type; + PyObject *Gt_singleton; + PyObject *Gt_type; + PyObject *IfExp_type; + PyObject *If_type; + PyObject *ImportFrom_type; + PyObject *Import_type; + PyObject *In_singleton; + PyObject *In_type; + PyObject *Interactive_type; + PyObject *Invert_singleton; + PyObject *Invert_type; + PyObject *IsNot_singleton; + PyObject *IsNot_type; + PyObject *Is_singleton; + PyObject *Is_type; + PyObject *JoinedStr_type; + PyObject *LShift_singleton; + PyObject *LShift_type; + PyObject *Lambda_type; + PyObject *ListComp_type; + PyObject *List_type; + PyObject *Load_singleton; + PyObject *Load_type; + PyObject *LtE_singleton; + PyObject *LtE_type; + PyObject *Lt_singleton; + PyObject *Lt_type; + PyObject *MatMult_singleton; + PyObject *MatMult_type; + PyObject *MatchAs_type; + PyObject *MatchClass_type; + PyObject *MatchMapping_type; + PyObject *MatchOr_type; + PyObject *MatchSequence_type; + PyObject *MatchSingleton_type; + PyObject *MatchStar_type; + PyObject *MatchValue_type; + PyObject *Match_type; + PyObject *Mod_singleton; + PyObject *Mod_type; + PyObject *Module_type; + PyObject *Mult_singleton; + PyObject *Mult_type; + PyObject *Name_type; + PyObject *NamedExpr_type; + PyObject *Nonlocal_type; + PyObject *NotEq_singleton; + PyObject *NotEq_type; + PyObject *NotIn_singleton; + PyObject *NotIn_type; + PyObject *Not_singleton; + PyObject *Not_type; + PyObject *Or_singleton; + PyObject *Or_type; + PyObject *ParamSpec_type; + PyObject *Pass_type; + PyObject *Pow_singleton; + PyObject *Pow_type; + PyObject *RShift_singleton; + PyObject *RShift_type; + PyObject *Raise_type; + PyObject *Return_type; + PyObject *SetComp_type; + PyObject *Set_type; + PyObject *Slice_type; + PyObject *Starred_type; + PyObject *Store_singleton; + PyObject *Store_type; + PyObject *Sub_singleton; + PyObject *Sub_type; + PyObject *Subscript_type; + PyObject *TryStar_type; + PyObject *Try_type; + PyObject *Tuple_type; + PyObject *TypeAlias_type; + 
PyObject *TypeIgnore_type; + PyObject *TypeVarTuple_type; + PyObject *TypeVar_type; + PyObject *UAdd_singleton; + PyObject *UAdd_type; + PyObject *USub_singleton; + PyObject *USub_type; + PyObject *UnaryOp_type; + PyObject *While_type; + PyObject *With_type; + PyObject *YieldFrom_type; + PyObject *Yield_type; + PyObject *__dict__; + PyObject *__doc__; + PyObject *__match_args__; + PyObject *__module__; + PyObject *_attributes; + PyObject *_fields; + PyObject *alias_type; + PyObject *annotation; + PyObject *arg; + PyObject *arg_type; + PyObject *args; + PyObject *argtypes; + PyObject *arguments_type; + PyObject *asname; + PyObject *ast; + PyObject *attr; + PyObject *bases; + PyObject *body; + PyObject *boolop_type; + PyObject *bound; + PyObject *cases; + PyObject *cause; + PyObject *cls; + PyObject *cmpop_type; + PyObject *col_offset; + PyObject *comparators; + PyObject *comprehension_type; + PyObject *context_expr; + PyObject *conversion; + PyObject *ctx; + PyObject *decorator_list; + PyObject *default_value; + PyObject *defaults; + PyObject *elt; + PyObject *elts; + PyObject *end_col_offset; + PyObject *end_lineno; + PyObject *exc; + PyObject *excepthandler_type; + PyObject *expr_context_type; + PyObject *expr_type; + PyObject *finalbody; + PyObject *format_spec; + PyObject *func; + PyObject *generators; + PyObject *guard; + PyObject *handlers; + PyObject *id; + PyObject *ifs; + PyObject *is_async; + PyObject *items; + PyObject *iter; + PyObject *key; + PyObject *keys; + PyObject *keyword_type; + PyObject *keywords; + PyObject *kind; + PyObject *kw_defaults; + PyObject *kwarg; + PyObject *kwd_attrs; + PyObject *kwd_patterns; + PyObject *kwonlyargs; + PyObject *left; + PyObject *level; + PyObject *lineno; + PyObject *lower; + PyObject *match_case_type; + PyObject *mod_type; + PyObject *module; + PyObject *msg; + PyObject *name; + PyObject *names; + PyObject *op; + PyObject *operand; + PyObject *operator_type; + PyObject *ops; + PyObject *optional_vars; + PyObject *orelse; + PyObject *pattern; + PyObject *pattern_type; + PyObject *patterns; + PyObject *posonlyargs; + PyObject *rest; + PyObject *returns; + PyObject *right; + PyObject *simple; + PyObject *slice; + PyObject *step; + PyObject *stmt_type; + PyObject *subject; + PyObject *tag; + PyObject *target; + PyObject *targets; + PyObject *test; + PyObject *type; + PyObject *type_comment; + PyObject *type_ignore_type; + PyObject *type_ignores; + PyObject *type_param_type; + PyObject *type_params; + PyObject *unaryop_type; + PyObject *upper; + PyObject *value; + PyObject *values; + PyObject *vararg; + PyObject *withitem_type; +}; + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_AST_STATE_H */ + diff --git a/Include/internal/pycore_atexit.h b/Include/internal/pycore_atexit.h new file mode 100644 index 0000000000000000000000000000000000000000..72c66a059395009b6985158b42221ef9c7774d5c --- /dev/null +++ b/Include/internal/pycore_atexit.h @@ -0,0 +1,67 @@ +#ifndef Py_INTERNAL_ATEXIT_H +#define Py_INTERNAL_ATEXIT_H + +#include "pycore_lock.h" // PyMutex + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + + +//############### +// runtime atexit + +typedef void (*atexit_callbackfunc)(void); + +struct _atexit_runtime_state { + PyMutex mutex; +#define NEXITFUNCS 32 + atexit_callbackfunc callbacks[NEXITFUNCS]; + int ncallbacks; +}; + + +//################### +// interpreter atexit + +typedef void (*atexit_datacallbackfunc)(void *); + +typedef struct atexit_callback 
{ + atexit_datacallbackfunc func; + void *data; + struct atexit_callback *next; +} atexit_callback; + +typedef struct { + PyObject *func; + PyObject *args; + PyObject *kwargs; +} atexit_py_callback; + +struct atexit_state { + atexit_callback *ll_callbacks; + // Kept for ABI compatibility--do not use! (See GH-127791.) + atexit_callback *last_ll_callback; + + // XXX The rest of the state could be moved to the atexit module state + // and a low-level callback added for it during module exec. + // For the moment we leave it here. + atexit_py_callback **callbacks; + int ncallbacks; + int callback_len; +}; + +// Export for '_interpchannels' shared extension +PyAPI_FUNC(int) _Py_AtExit( + PyInterpreterState *interp, + atexit_datacallbackfunc func, + void *data); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_ATEXIT_H */ diff --git a/Include/internal/pycore_backoff.h b/Include/internal/pycore_backoff.h new file mode 100644 index 0000000000000000000000000000000000000000..0bcca1e769b341f021f97aafb2a5a33512f5d47f --- /dev/null +++ b/Include/internal/pycore_backoff.h @@ -0,0 +1,145 @@ + +#ifndef Py_INTERNAL_BACKOFF_H +#define Py_INTERNAL_BACKOFF_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> + + +typedef struct { + union { + struct { + uint16_t backoff : 4; + uint16_t value : 12; + }; + uint16_t as_counter; // For printf("%#x", ...) + }; +} _Py_BackoffCounter; + + +/* 16-bit countdown counters using exponential backoff. + + These are used by the adaptive specializer to count down until + it is time to specialize an instruction. If specialization fails + the counter is reset using exponential backoff. + + Another use is for the Tier 2 optimizer to decide when to create + a new Tier 2 trace (executor). Again, exponential backoff is used. + + The 16-bit counter is structured as a 12-bit unsigned 'value' + and a 4-bit 'backoff' field. When resetting the counter, the + backoff field is incremented (until it reaches a limit) and the + value is set to a bit mask representing the value 2**backoff - 1. + The maximum backoff is 12 (the number of value bits). + + There is an exceptional value which must not be updated, 0xFFFF.
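+ + Worked example (added illustration): a counter made by + make_backoff_counter(15, 4) counts down 15, 14, ..., 0 under + advance_backoff_counter(); backoff_counter_triggers() fires at 0, and + restart_backoff_counter() then restarts it at value 31 with backoff 5, + then 63, 127, ... capped at 2**12 - 1.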
+*/ + +#define UNREACHABLE_BACKOFF 0xFFFF + +static inline bool +is_unreachable_backoff_counter(_Py_BackoffCounter counter) +{ + return counter.as_counter == UNREACHABLE_BACKOFF; +} + +static inline _Py_BackoffCounter +make_backoff_counter(uint16_t value, uint16_t backoff) +{ + assert(backoff <= 15); + assert(value <= 0xFFF); + _Py_BackoffCounter result; + result.value = value; + result.backoff = backoff; + return result; +} + +static inline _Py_BackoffCounter +forge_backoff_counter(uint16_t counter) +{ + _Py_BackoffCounter result; + result.as_counter = counter; + return result; +} + +static inline _Py_BackoffCounter +restart_backoff_counter(_Py_BackoffCounter counter) +{ + assert(!is_unreachable_backoff_counter(counter)); + if (counter.backoff < 12) { + return make_backoff_counter((1 << (counter.backoff + 1)) - 1, counter.backoff + 1); + } + else { + return make_backoff_counter((1 << 12) - 1, 12); + } +} + +static inline _Py_BackoffCounter +pause_backoff_counter(_Py_BackoffCounter counter) +{ + return make_backoff_counter(counter.value | 1, counter.backoff); +} + +static inline _Py_BackoffCounter +advance_backoff_counter(_Py_BackoffCounter counter) +{ + if (!is_unreachable_backoff_counter(counter)) { + return make_backoff_counter((counter.value - 1) & 0xFFF, counter.backoff); + } + else { + return counter; + } +} + +static inline bool +backoff_counter_triggers(_Py_BackoffCounter counter) +{ + return counter.value == 0; +} + +/* Initial JUMP_BACKWARD counter. + * This determines when we create a trace for a loop. +* Backoff sequence 16, 32, 64, 128, 256, 512, 1024, 2048, 4096. */ +#define JUMP_BACKWARD_INITIAL_VALUE 16 +#define JUMP_BACKWARD_INITIAL_BACKOFF 4 +static inline _Py_BackoffCounter +initial_jump_backoff_counter(void) +{ + return make_backoff_counter(JUMP_BACKWARD_INITIAL_VALUE, + JUMP_BACKWARD_INITIAL_BACKOFF); +} + +/* Initial exit temperature. + * Must be larger than ADAPTIVE_COOLDOWN_VALUE, + * otherwise when a side exit warms up we may construct + * a new trace before the Tier 1 code has properly re-specialized. + * Backoff sequence 64, 128, 256, 512, 1024, 2048, 4096. */ +#define COLD_EXIT_INITIAL_VALUE 64 +#define COLD_EXIT_INITIAL_BACKOFF 6 + +static inline _Py_BackoffCounter +initial_temperature_backoff_counter(void) +{ + return make_backoff_counter(COLD_EXIT_INITIAL_VALUE, + COLD_EXIT_INITIAL_BACKOFF); +} + +/* Unreachable backoff counter. */ +static inline _Py_BackoffCounter +initial_unreachable_backoff_counter(void) +{ + return forge_backoff_counter(UNREACHABLE_BACKOFF); +} + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_BACKOFF_H */ diff --git a/Include/internal/pycore_bitutils.h b/Include/internal/pycore_bitutils.h new file mode 100644 index 0000000000000000000000000000000000000000..50f69377523818ff1177b648a6efb6427b51ff98 --- /dev/null +++ b/Include/internal/pycore_bitutils.h @@ -0,0 +1,186 @@ +/* Bit and bytes utilities. + + Bytes swap functions, reverse order of bytes: + + - _Py_bswap16(uint16_t) + - _Py_bswap32(uint32_t) + - _Py_bswap64(uint64_t) +*/ + +#ifndef Py_INTERNAL_BITUTILS_H +#define Py_INTERNAL_BITUTILS_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#if defined(__GNUC__) \ + && ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) + /* __builtin_bswap16() is available since GCC 4.8, + __builtin_bswap32() is available since GCC 4.3, + __builtin_bswap64() is available since GCC 4.3. 
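+ + As a concrete check (added illustration): byte swapping reverses byte + order, e.g. _Py_bswap32(0x12345678) == 0x78563412.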
*/ +# define _PY_HAVE_BUILTIN_BSWAP +#endif + +#ifdef _MSC_VER +# include <intrin.h> // _byteswap_uint64() +#endif + + +static inline uint16_t +_Py_bswap16(uint16_t word) +{ +#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap16) + return __builtin_bswap16(word); +#elif defined(_MSC_VER) + Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned short)); + return _byteswap_ushort(word); +#else + // Portable implementation which doesn't rely on circular bit shift + return ( ((word & UINT16_C(0x00FF)) << 8) + | ((word & UINT16_C(0xFF00)) >> 8)); +#endif +} + +static inline uint32_t +_Py_bswap32(uint32_t word) +{ +#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap32) + return __builtin_bswap32(word); +#elif defined(_MSC_VER) + Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned long)); + return _byteswap_ulong(word); +#else + // Portable implementation which doesn't rely on circular bit shift + return ( ((word & UINT32_C(0x000000FF)) << 24) + | ((word & UINT32_C(0x0000FF00)) << 8) + | ((word & UINT32_C(0x00FF0000)) >> 8) + | ((word & UINT32_C(0xFF000000)) >> 24)); +#endif +} + +static inline uint64_t +_Py_bswap64(uint64_t word) +{ +#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap64) + return __builtin_bswap64(word); +#elif defined(_MSC_VER) + return _byteswap_uint64(word); +#else + // Portable implementation which doesn't rely on circular bit shift + return ( ((word & UINT64_C(0x00000000000000FF)) << 56) + | ((word & UINT64_C(0x000000000000FF00)) << 40) + | ((word & UINT64_C(0x0000000000FF0000)) << 24) + | ((word & UINT64_C(0x00000000FF000000)) << 8) + | ((word & UINT64_C(0x000000FF00000000)) >> 8) + | ((word & UINT64_C(0x0000FF0000000000)) >> 24) + | ((word & UINT64_C(0x00FF000000000000)) >> 40) + | ((word & UINT64_C(0xFF00000000000000)) >> 56)); +#endif +} + + +// Population count: count the number of 1's in 'x' +// (number of bits set to 1), also known as the Hamming weight. +// +// Implementation note: to keep the implementation simple, CPUID is not used +// to test whether the x86 POPCNT instruction can be used. For example, Visual +// Studio's __popcnt() is not used for this reason. The clang and GCC builtin +// function can use the x86 POPCNT instruction if the target architecture has +// SSE4a or newer. +static inline int +_Py_popcount32(uint32_t x) +{ +#if (defined(__clang__) || defined(__GNUC__)) + +#if SIZEOF_INT >= 4 + Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int)); + return __builtin_popcount(x); +#else + // The C standard guarantees that unsigned long will always be big enough + // to hold a uint32_t value without losing information. + Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long)); + return __builtin_popcountl(x); +#endif + +#else + // 32-bit SWAR (SIMD Within A Register) popcount + + // Binary: 0 1 0 1 ... + const uint32_t M1 = 0x55555555; + // Binary: 00 11 00 11 ... + const uint32_t M2 = 0x33333333; + // Binary: 0000 1111 0000 1111 ... + const uint32_t M4 = 0x0F0F0F0F; + + // Put count of each 2 bits into those 2 bits + x = x - ((x >> 1) & M1); + // Put count of each 4 bits into those 4 bits + x = (x & M2) + ((x >> 2) & M2); + // Put count of each 8 bits into those 8 bits + x = (x + (x >> 4)) & M4; + // Sum of the 4 byte counts. + // Take care when considering changes to the next line. Portability and + // correctness are delicate here, thanks to C's "integer promotions" (C99 +// §6.3.1.1p2).
On machines where the `int` type has width greater than 32 + // bits, `x` will be promoted to an `int`, and following C's "usual + // arithmetic conversions" (C99 §6.3.1.8), the multiplication will be + // performed as a multiplication of two `unsigned int` operands. In this + // case it's critical that we cast back to `uint32_t` in order to keep only + // the least significant 32 bits. On machines where the `int` type has + // width no greater than 32, the multiplication is of two 32-bit unsigned + // integer types, and the (uint32_t) cast is a no-op. In both cases, we + // avoid the risk of undefined behaviour due to overflow of a + // multiplication of signed integer types. + return (uint32_t)(x * 0x01010101U) >> 24; +#endif +} + + +// Return the index of the most significant 1 bit in 'x'. This is the smallest +// integer k such that x < 2**k. Equivalent to floor(log2(x)) + 1 for x != 0. +static inline int +_Py_bit_length(unsigned long x) +{ +#if (defined(__clang__) || defined(__GNUC__)) + if (x != 0) { + // __builtin_clzl() is available since GCC 3.4. + // Undefined behavior for x == 0. + return (int)sizeof(unsigned long) * 8 - __builtin_clzl(x); + } + else { + return 0; + } +#elif defined(_MSC_VER) + // _BitScanReverse() is documented to search 32 bits. + Py_BUILD_ASSERT(sizeof(unsigned long) <= 4); + unsigned long msb; + if (_BitScanReverse(&msb, x)) { + return (int)msb + 1; + } + else { + return 0; + } +#else + const int BIT_LENGTH_TABLE[32] = { + 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 + }; + int msb = 0; + while (x >= 32) { + msb += 6; + x >>= 6; + } + msb += BIT_LENGTH_TABLE[x]; + return msb; +#endif +} + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_BITUTILS_H */ diff --git a/Include/internal/pycore_blocks_output_buffer.h b/Include/internal/pycore_blocks_output_buffer.h new file mode 100644 index 0000000000000000000000000000000000000000..573e10359b7bd271c2da4c9674a643b8e738ae41 --- /dev/null +++ b/Include/internal/pycore_blocks_output_buffer.h @@ -0,0 +1,321 @@ +/* + _BlocksOutputBuffer is used to maintain an output buffer + that has unpredictable size. Suitable for compression/decompression + API (bz2/lzma/zlib) that has stream->next_out and stream->avail_out: + + stream->next_out: points to the next output position. + stream->avail_out: the number of available bytes left in the buffer. + + It maintains a list of bytes objects, so there is no overhead of resizing + the buffer. + + Usage: + + 1, Initialize the struct instance like this: + _BlocksOutputBuffer buffer = {.list = NULL}; + Set .list to NULL for _BlocksOutputBuffer_OnError() + + 2, Initialize the buffer using one of these functions: + _BlocksOutputBuffer_InitAndGrow() + _BlocksOutputBuffer_InitWithSize() + + 3, If (avail_out == 0), grow the buffer: + _BlocksOutputBuffer_Grow() + + 4, Get the current outputted data size: + _BlocksOutputBuffer_GetDataSize() + + 5, Finish the buffer, and return a bytes object: + _BlocksOutputBuffer_Finish() + + 6, Clean up the buffer when an error occurs: + _BlocksOutputBuffer_OnError() +*/ + +#ifndef Py_INTERNAL_BLOCKS_OUTPUT_BUFFER_H +#define Py_INTERNAL_BLOCKS_OUTPUT_BUFFER_H +#ifdef __cplusplus +extern "C" { +#endif + +#include "Python.h" + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +typedef struct { + // List of bytes objects + PyObject *list; + // Total allocated size of all blocks + Py_ssize_t allocated; + // Max length of the buffer, negative number means unlimited length.
+ Py_ssize_t max_length; +} _BlocksOutputBuffer; + +static const char unable_allocate_msg[] = "Unable to allocate output buffer."; + +/* In a 32-bit build, the max block size should be <= INT32_MAX. */ +#define OUTPUT_BUFFER_MAX_BLOCK_SIZE (256*1024*1024) + +/* Block size sequence */ +#define KB (1024) +#define MB (1024*1024) +static const Py_ssize_t BUFFER_BLOCK_SIZE[] = + { 32*KB, 64*KB, 256*KB, 1*MB, 4*MB, 8*MB, 16*MB, 16*MB, + 32*MB, 32*MB, 32*MB, 32*MB, 64*MB, 64*MB, 128*MB, 128*MB, + OUTPUT_BUFFER_MAX_BLOCK_SIZE }; +#undef KB +#undef MB + +/* According to the block sizes defined by BUFFER_BLOCK_SIZE, the total + allocated size grows in these steps: + 1 32 KB +32 KB + 2 96 KB +64 KB + 3 352 KB +256 KB + 4 1.34 MB +1 MB + 5 5.34 MB +4 MB + 6 13.34 MB +8 MB + 7 29.34 MB +16 MB + 8 45.34 MB +16 MB + 9 77.34 MB +32 MB + 10 109.34 MB +32 MB + 11 141.34 MB +32 MB + 12 173.34 MB +32 MB + 13 237.34 MB +64 MB + 14 301.34 MB +64 MB + 15 429.34 MB +128 MB + 16 557.34 MB +128 MB + 17 813.34 MB +256 MB + 18 1069.34 MB +256 MB + 19 1325.34 MB +256 MB + 20 1581.34 MB +256 MB + 21 1837.34 MB +256 MB + 22 2093.34 MB +256 MB + ... +*/ + +/* Initialize the buffer, and grow the buffer. + + max_length: Max length of the buffer, -1 for unlimited length. + + On success, return allocated size (>=0) + On failure, return -1 +*/ +static inline Py_ssize_t +_BlocksOutputBuffer_InitAndGrow(_BlocksOutputBuffer *buffer, + const Py_ssize_t max_length, + void **next_out) +{ + PyObject *b; + Py_ssize_t block_size; + + // ensure .list was set to NULL + assert(buffer->list == NULL); + + // get block size + if (0 <= max_length && max_length < BUFFER_BLOCK_SIZE[0]) { + block_size = max_length; + } else { + block_size = BUFFER_BLOCK_SIZE[0]; + } + + // the first block + b = PyBytes_FromStringAndSize(NULL, block_size); + if (b == NULL) { + return -1; + } + + // create the list + buffer->list = PyList_New(1); + if (buffer->list == NULL) { + Py_DECREF(b); + return -1; + } + PyList_SET_ITEM(buffer->list, 0, b); + + // set variables + buffer->allocated = block_size; + buffer->max_length = max_length; + + *next_out = PyBytes_AS_STRING(b); + return block_size; +} + +/* Initialize the buffer, with an initial size. + + Check the block size limit in the outer wrapper function. For example, some + libs accept UINT32_MAX as the maximum block size, so init_size should be + <= it. + + On success, return allocated size (>=0) + On failure, return -1 +*/ +static inline Py_ssize_t +_BlocksOutputBuffer_InitWithSize(_BlocksOutputBuffer *buffer, + const Py_ssize_t init_size, + void **next_out) +{ + PyObject *b; + + // ensure .list was set to NULL + assert(buffer->list == NULL); + + // the first block + b = PyBytes_FromStringAndSize(NULL, init_size); + if (b == NULL) { + PyErr_SetString(PyExc_MemoryError, unable_allocate_msg); + return -1; + } + + // create the list + buffer->list = PyList_New(1); + if (buffer->list == NULL) { + Py_DECREF(b); + return -1; + } + PyList_SET_ITEM(buffer->list, 0, b); + + // set variables + buffer->allocated = init_size; + buffer->max_length = -1; + + *next_out = PyBytes_AS_STRING(b); + return init_size; +} + +/* Grow the buffer. The avail_out must be 0; check it before calling.
+ + On success, return allocated size (>=0) + On failure, return -1 +*/ +static inline Py_ssize_t +_BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer, + void **next_out, + const Py_ssize_t avail_out) +{ + PyObject *b; + const Py_ssize_t list_len = Py_SIZE(buffer->list); + Py_ssize_t block_size; + + // ensure no gaps in the data + if (avail_out != 0) { + PyErr_SetString(PyExc_SystemError, + "avail_out is non-zero in _BlocksOutputBuffer_Grow()."); + return -1; + } + + // get block size + if (list_len < (Py_ssize_t) Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE)) { + block_size = BUFFER_BLOCK_SIZE[list_len]; + } else { + block_size = BUFFER_BLOCK_SIZE[Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE) - 1]; + } + + // check max_length + if (buffer->max_length >= 0) { + // if (rest == 0), should not grow the buffer. + Py_ssize_t rest = buffer->max_length - buffer->allocated; + assert(rest > 0); + + // block_size of the last block + if (block_size > rest) { + block_size = rest; + } + } + + // check buffer->allocated overflow + if (block_size > PY_SSIZE_T_MAX - buffer->allocated) { + PyErr_SetString(PyExc_MemoryError, unable_allocate_msg); + return -1; + } + + // create the block + b = PyBytes_FromStringAndSize(NULL, block_size); + if (b == NULL) { + PyErr_SetString(PyExc_MemoryError, unable_allocate_msg); + return -1; + } + if (PyList_Append(buffer->list, b) < 0) { + Py_DECREF(b); + return -1; + } + Py_DECREF(b); + + // set variables + buffer->allocated += block_size; + + *next_out = PyBytes_AS_STRING(b); + return block_size; +} + +/* Return the current outputted data size. */ +static inline Py_ssize_t +_BlocksOutputBuffer_GetDataSize(_BlocksOutputBuffer *buffer, + const Py_ssize_t avail_out) +{ + return buffer->allocated - avail_out; +} + +/* Finish the buffer. + + Return a bytes object on success + Return NULL on failure +*/ +static inline PyObject * +_BlocksOutputBuffer_Finish(_BlocksOutputBuffer *buffer, + const Py_ssize_t avail_out) +{ + PyObject *result, *block; + const Py_ssize_t list_len = Py_SIZE(buffer->list); + + // fast path for single block + if ((list_len == 1 && avail_out == 0) || + (list_len == 2 && Py_SIZE(PyList_GET_ITEM(buffer->list, 1)) == avail_out)) + { + block = PyList_GET_ITEM(buffer->list, 0); + Py_INCREF(block); + + Py_CLEAR(buffer->list); + return block; + } + + // final bytes object + result = PyBytes_FromStringAndSize(NULL, buffer->allocated - avail_out); + if (result == NULL) { + PyErr_SetString(PyExc_MemoryError, unable_allocate_msg); + return NULL; + } + + // memory copy + if (list_len > 0) { + char *posi = PyBytes_AS_STRING(result); + + // blocks except the last one + Py_ssize_t i = 0; + for (; i < list_len-1; i++) { + block = PyList_GET_ITEM(buffer->list, i); + memcpy(posi, PyBytes_AS_STRING(block), Py_SIZE(block)); + posi += Py_SIZE(block); + } + // the last block + block = PyList_GET_ITEM(buffer->list, i); + memcpy(posi, PyBytes_AS_STRING(block), Py_SIZE(block) - avail_out); + } else { + assert(Py_SIZE(result) == 0); + } + + Py_CLEAR(buffer->list); + return result; +} + +/* Clean up the buffer when an error occurred. 
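+
+   Safe to call at any point after step 1 of the usage sequence above: if
+   .list is still NULL (or was already cleared), Py_CLEAR() is a no-op.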
*/
+static inline void
+_BlocksOutputBuffer_OnError(_BlocksOutputBuffer *buffer)
+{
+    Py_CLEAR(buffer->list);
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* Py_INTERNAL_BLOCKS_OUTPUT_BUFFER_H */
diff --git a/Include/internal/pycore_brc.h b/Include/internal/pycore_brc.h
new file mode 100644
index 0000000000000000000000000000000000000000..3453d83b57ca97e5dcc0ca4c98a1ced4e8b0fc1c
--- /dev/null
+++ b/Include/internal/pycore_brc.h
@@ -0,0 +1,74 @@
+#ifndef Py_INTERNAL_BRC_H
+#define Py_INTERNAL_BRC_H
+
+#include <stdint.h>
+#include "pycore_llist.h"         // struct llist_node
+#include "pycore_lock.h"          // PyMutex
+#include "pycore_object_stack.h"  // _PyObjectStack
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#ifdef Py_GIL_DISABLED
+
+// Prime number to avoid correlations with memory addresses.
+#define _Py_BRC_NUM_BUCKETS 257
+
+// Hash table bucket
+struct _brc_bucket {
+    // Mutex protects both the bucket and thread state queues in this bucket.
+    PyMutex mutex;
+
+    // Linked list of _PyThreadStateImpl objects hashed to this bucket.
+    struct llist_node root;
+};
+
+// Per-interpreter biased reference counting state
+struct _brc_state {
+    // Hash table of thread states by thread-id. Thread states within a bucket
+    // are chained using a doubly-linked list.
+    struct _brc_bucket table[_Py_BRC_NUM_BUCKETS];
+};
+
+// Per-thread biased reference counting state
+struct _brc_thread_state {
+    // Linked-list of thread states per hash bucket
+    struct llist_node bucket_node;
+
+    // Thread-id as determined by _PyThread_Id()
+    uintptr_t tid;
+
+    // Objects with refcounts to be merged (protected by bucket mutex)
+    _PyObjectStack objects_to_merge;
+
+    // Local stack of objects to be merged (not accessed by other threads)
+    _PyObjectStack local_objects_to_merge;
+};
+
+// Initialize/finalize the per-thread biased reference counting state
+void _Py_brc_init_thread(PyThreadState *tstate);
+void _Py_brc_remove_thread(PyThreadState *tstate);
+
+// Initialize per-interpreter state
+void _Py_brc_init_state(PyInterpreterState *interp);
+
+void _Py_brc_after_fork(PyInterpreterState *interp);
+
+// Enqueues an object to be merged by its owning thread (tid). This
+// steals a reference to the object.
+void _Py_brc_queue_object(PyObject *ob);
+
+// Merge the refcounts of queued objects for the current thread.
+void _Py_brc_merge_refcounts(PyThreadState *tstate);
+
+#endif /* Py_GIL_DISABLED */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_BRC_H */
diff --git a/Include/internal/pycore_bytes_methods.h b/Include/internal/pycore_bytes_methods.h
new file mode 100644
index 0000000000000000000000000000000000000000..059dc2599bbd77e861dbf0649372281e306de75e
--- /dev/null
+++ b/Include/internal/pycore_bytes_methods.h
@@ -0,0 +1,82 @@
+#ifndef Py_LIMITED_API
+#ifndef Py_BYTES_CTYPE_H
+#define Py_BYTES_CTYPE_H
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+/*
+ * The internal implementation behind PyBytes (bytes) and PyByteArray (bytearray)
+ * methods of the given names; they operate on ASCII byte strings.
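+ *
+ * For example, both bytes.isdigit() and bytearray.isdigit() forward to
+ * _Py_bytes_isdigit(), passing the object's underlying buffer and length.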
+ */ +extern PyObject* _Py_bytes_isspace(const char *cptr, Py_ssize_t len); +extern PyObject* _Py_bytes_isalpha(const char *cptr, Py_ssize_t len); +extern PyObject* _Py_bytes_isalnum(const char *cptr, Py_ssize_t len); +extern PyObject* _Py_bytes_isascii(const char *cptr, Py_ssize_t len); +extern PyObject* _Py_bytes_isdigit(const char *cptr, Py_ssize_t len); +extern PyObject* _Py_bytes_islower(const char *cptr, Py_ssize_t len); +extern PyObject* _Py_bytes_isupper(const char *cptr, Py_ssize_t len); +extern PyObject* _Py_bytes_istitle(const char *cptr, Py_ssize_t len); + +/* These store their len sized answer in the given preallocated *result arg. */ +extern void _Py_bytes_lower(char *result, const char *cptr, Py_ssize_t len); +extern void _Py_bytes_upper(char *result, const char *cptr, Py_ssize_t len); +extern void _Py_bytes_title(char *result, const char *s, Py_ssize_t len); +extern void _Py_bytes_capitalize(char *result, const char *s, Py_ssize_t len); +extern void _Py_bytes_swapcase(char *result, const char *s, Py_ssize_t len); + +extern PyObject *_Py_bytes_find(const char *str, Py_ssize_t len, PyObject *sub, + Py_ssize_t start, Py_ssize_t end); +extern PyObject *_Py_bytes_index(const char *str, Py_ssize_t len, PyObject *sub, + Py_ssize_t start, Py_ssize_t end); +extern PyObject *_Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *sub, + Py_ssize_t start, Py_ssize_t end); +extern PyObject *_Py_bytes_rindex(const char *str, Py_ssize_t len, PyObject *sub, + Py_ssize_t start, Py_ssize_t end); +extern PyObject *_Py_bytes_count(const char *str, Py_ssize_t len, PyObject *sub, + Py_ssize_t start, Py_ssize_t end); +extern int _Py_bytes_contains(const char *str, Py_ssize_t len, PyObject *arg); +extern PyObject *_Py_bytes_startswith(const char *str, Py_ssize_t len, + PyObject *subobj, Py_ssize_t start, + Py_ssize_t end); +extern PyObject *_Py_bytes_endswith(const char *str, Py_ssize_t len, + PyObject *subobj, Py_ssize_t start, + Py_ssize_t end); + +/* The maketrans() static method. */ +extern PyObject* _Py_bytes_maketrans(Py_buffer *frm, Py_buffer *to); + +/* Shared __doc__ strings. 
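+
+   Each one is defined exactly once via PyDoc_STRVAR_shared() (see the end of
+   this header) and then shared by the bytes and bytearray method tables,
+   rather than duplicating the text in each object file.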
*/
+extern const char _Py_isspace__doc__[];
+extern const char _Py_isalpha__doc__[];
+extern const char _Py_isalnum__doc__[];
+extern const char _Py_isascii__doc__[];
+extern const char _Py_isdigit__doc__[];
+extern const char _Py_islower__doc__[];
+extern const char _Py_isupper__doc__[];
+extern const char _Py_istitle__doc__[];
+extern const char _Py_lower__doc__[];
+extern const char _Py_upper__doc__[];
+extern const char _Py_title__doc__[];
+extern const char _Py_capitalize__doc__[];
+extern const char _Py_swapcase__doc__[];
+extern const char _Py_count__doc__[];
+extern const char _Py_find__doc__[];
+extern const char _Py_index__doc__[];
+extern const char _Py_rfind__doc__[];
+extern const char _Py_rindex__doc__[];
+extern const char _Py_startswith__doc__[];
+extern const char _Py_endswith__doc__[];
+extern const char _Py_maketrans__doc__[];
+extern const char _Py_expandtabs__doc__[];
+extern const char _Py_ljust__doc__[];
+extern const char _Py_rjust__doc__[];
+extern const char _Py_center__doc__[];
+extern const char _Py_zfill__doc__[];
+
+/* this is needed because some docs are shared from the .o, not static */
+#define PyDoc_STRVAR_shared(name,str) const char name[] = PyDoc_STR(str)
+
+#endif /* !Py_BYTES_CTYPE_H */
+#endif /* !Py_LIMITED_API */
diff --git a/Include/internal/pycore_bytesobject.h b/Include/internal/pycore_bytesobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c922a4fb3037ae30b6aa5380dac229026eac263
--- /dev/null
+++ b/Include/internal/pycore_bytesobject.h
@@ -0,0 +1,152 @@
+#ifndef Py_INTERNAL_BYTESOBJECT_H
+#define Py_INTERNAL_BYTESOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+extern PyObject* _PyBytes_FormatEx(
+    const char *format,
+    Py_ssize_t format_len,
+    PyObject *args,
+    int use_bytearray);
+
+extern PyObject* _PyBytes_FromHex(
+    PyObject *string,
+    int use_bytearray);
+
+// Helper for PyBytes_DecodeEscape that detects invalid escape chars.
+// Export for test_peg_generator.
+PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
+                                             const char *,
+                                             int *, const char **);
+// Export for binary compatibility.
+PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
+                                            const char *, const char **);
+
+
+// Substring Search.
+//
+// Returns the index of the first occurrence of
+// a substring ("needle") in a larger text ("haystack").
+// If the needle is not found, return -1.
+// If the needle is found, add offset to the index.
+//
+// Export for 'mmap' shared extension.
+PyAPI_FUNC(Py_ssize_t)
+_PyBytes_Find(const char *haystack, Py_ssize_t len_haystack,
+              const char *needle, Py_ssize_t len_needle,
+              Py_ssize_t offset);
+
+// Same as above, but search right-to-left.
+// Export for 'mmap' shared extension.
+PyAPI_FUNC(Py_ssize_t)
+_PyBytes_ReverseFind(const char *haystack, Py_ssize_t len_haystack,
+                     const char *needle, Py_ssize_t len_needle,
+                     Py_ssize_t offset);
+
+
+// Helper function to implement the repeat and inplace repeat methods on a
+// buffer.
+//
+// len_dest is assumed to be an integer multiple of len_src.
+// If src equals dest, then assume the operation is inplace.
+//
+// This method repeatedly doubles the number of bytes copied to reduce
+// the number of invocations of memcpy.
+//
+// Export for 'array' shared extension.
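+//
+// For example (a sketch): repeating a 3-byte pattern into a 24-byte
+// destination copies the 3 source bytes once, then doubles in place,
+// 3 -> 6 -> 12 -> 24, so 4 memcpy calls suffice instead of 8.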
+PyAPI_FUNC(void)
+_PyBytes_Repeat(char* dest, Py_ssize_t len_dest,
+                const char* src, Py_ssize_t len_src);
+
+/* --- _PyBytesWriter ----------------------------------------------------- */
+
+/* The _PyBytesWriter structure is big: it contains an embedded "stack buffer".
+   A _PyBytesWriter variable must be declared last among a function's local
+   variables to optimize the memory allocation on the stack. */
+typedef struct {
+    /* bytes, bytearray or NULL (when the small buffer is used) */
+    PyObject *buffer;
+
+    /* Number of allocated bytes. */
+    Py_ssize_t allocated;
+
+    /* Minimum number of allocated bytes,
+       incremented by _PyBytesWriter_Prepare() */
+    Py_ssize_t min_size;
+
+    /* If non-zero, use a bytearray instead of a bytes object for buffer. */
+    int use_bytearray;
+
+    /* If non-zero, overallocate the buffer (default: 0).
+       This flag must be zero if use_bytearray is non-zero. */
+    int overallocate;
+
+    /* Stack buffer */
+    int use_small_buffer;
+    char small_buffer[512];
+} _PyBytesWriter;
+
+/* Initialize a bytes writer
+
+   By default, the overallocation is disabled. Set the overallocate attribute
+   to control the allocation of the buffer.
+
+   Export _PyBytesWriter API for '_pickle' shared extension. */
+PyAPI_FUNC(void) _PyBytesWriter_Init(_PyBytesWriter *writer);
+
+/* Get the buffer content and reset the writer.
+   Return a bytes object, or a bytearray object if use_bytearray is non-zero.
+   Raise an exception and return NULL on error. */
+PyAPI_FUNC(PyObject *) _PyBytesWriter_Finish(_PyBytesWriter *writer,
+                                             void *str);
+
+/* Deallocate memory of a writer (clear its internal buffer). */
+PyAPI_FUNC(void) _PyBytesWriter_Dealloc(_PyBytesWriter *writer);
+
+/* Allocate the buffer to write size bytes.
+   Return the pointer to the beginning of buffer data.
+   Raise an exception and return NULL on error. */
+PyAPI_FUNC(void*) _PyBytesWriter_Alloc(_PyBytesWriter *writer,
+                                       Py_ssize_t size);
+
+/* Ensure that the buffer is large enough to write *size* bytes.
+   Add size to the writer minimum size (min_size attribute).
+
+   str is the current pointer inside the buffer.
+   Return the updated current pointer inside the buffer.
+   Raise an exception and return NULL on error. */
+PyAPI_FUNC(void*) _PyBytesWriter_Prepare(_PyBytesWriter *writer,
+                                         void *str,
+                                         Py_ssize_t size);
+
+/* Resize the buffer to make it larger.
+   The new buffer may be larger than size bytes because of overallocation.
+   Return the updated current pointer inside the buffer.
+   Raise an exception and return NULL on error.
+
+   Note: size must be greater than the number of allocated bytes in the writer.
+
+   This function doesn't use the writer minimum size (min_size attribute).
+
+   See also _PyBytesWriter_Prepare().
+   */
+PyAPI_FUNC(void*) _PyBytesWriter_Resize(_PyBytesWriter *writer,
+                                        void *str,
+                                        Py_ssize_t size);
+
+/* Write bytes.
+   Raise an exception and return NULL on error.
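+
+   Typical writer usage (a minimal sketch):
+
+       char *str;
+       _PyBytesWriter writer;
+
+       _PyBytesWriter_Init(&writer);
+       str = _PyBytesWriter_Alloc(&writer, 3);
+       if (str == NULL) goto error;
+       str = _PyBytesWriter_WriteBytes(&writer, str, "abc", 3);
+       if (str == NULL) goto error;
+       return _PyBytesWriter_Finish(&writer, str);
+   error:
+       _PyBytesWriter_Dealloc(&writer);
+       return NULL;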
*/ +PyAPI_FUNC(void*) _PyBytesWriter_WriteBytes(_PyBytesWriter *writer, + void *str, + const void *bytes, + Py_ssize_t size); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_BYTESOBJECT_H */ diff --git a/Include/internal/pycore_call.h b/Include/internal/pycore_call.h new file mode 100644 index 0000000000000000000000000000000000000000..c92028a01299e2d8090b37fe85947678579aca3e --- /dev/null +++ b/Include/internal/pycore_call.h @@ -0,0 +1,205 @@ +#ifndef Py_INTERNAL_CALL_H +#define Py_INTERNAL_CALL_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_identifier.h" // _Py_Identifier +#include "pycore_pystate.h" // _PyThreadState_GET() + +/* Suggested size (number of positional arguments) for arrays of PyObject* + allocated on a C stack to avoid allocating memory on the heap memory. Such + array is used to pass positional arguments to call functions of the + PyObject_Vectorcall() family. + + The size is chosen to not abuse the C stack and so limit the risk of stack + overflow. The size is also chosen to allow using the small stack for most + function calls of the Python standard library. On 64-bit CPU, it allocates + 40 bytes on the stack. */ +#define _PY_FASTCALL_SMALL_STACK 5 + + +// Export for 'math' shared extension, used via _PyObject_VectorcallTstate() +// static inline function. +PyAPI_FUNC(PyObject*) _Py_CheckFunctionResult( + PyThreadState *tstate, + PyObject *callable, + PyObject *result, + const char *where); + +extern PyObject* _PyObject_Call_Prepend( + PyThreadState *tstate, + PyObject *callable, + PyObject *obj, + PyObject *args, + PyObject *kwargs); + +extern PyObject* _PyObject_VectorcallDictTstate( + PyThreadState *tstate, + PyObject *callable, + PyObject *const *args, + size_t nargsf, + PyObject *kwargs); + +extern PyObject* _PyObject_Call( + PyThreadState *tstate, + PyObject *callable, + PyObject *args, + PyObject *kwargs); + +extern PyObject * _PyObject_CallMethodFormat( + PyThreadState *tstate, + PyObject *callable, + const char *format, + ...); + +// Export for 'array' shared extension +PyAPI_FUNC(PyObject*) _PyObject_CallMethod( + PyObject *obj, + PyObject *name, + const char *format, ...); + +extern PyObject* _PyObject_CallMethodIdObjArgs( + PyObject *obj, + _Py_Identifier *name, + ...); + +static inline PyObject * +_PyObject_VectorcallMethodId( + _Py_Identifier *name, PyObject *const *args, + size_t nargsf, PyObject *kwnames) +{ + PyObject *oname = _PyUnicode_FromId(name); /* borrowed */ + if (!oname) { + return _Py_NULL; + } + return PyObject_VectorcallMethod(oname, args, nargsf, kwnames); +} + +static inline PyObject * +_PyObject_CallMethodIdNoArgs(PyObject *self, _Py_Identifier *name) +{ + size_t nargsf = 1 | PY_VECTORCALL_ARGUMENTS_OFFSET; + return _PyObject_VectorcallMethodId(name, &self, nargsf, _Py_NULL); +} + +static inline PyObject * +_PyObject_CallMethodIdOneArg(PyObject *self, _Py_Identifier *name, PyObject *arg) +{ + PyObject *args[2] = {self, arg}; + size_t nargsf = 2 | PY_VECTORCALL_ARGUMENTS_OFFSET; + assert(arg != NULL); + return _PyObject_VectorcallMethodId(name, args, nargsf, _Py_NULL); +} + + +/* === Vectorcall protocol (PEP 590) ============================= */ + +// Call callable using tp_call. Arguments are like PyObject_Vectorcall(), +// except that nargs is plainly the number of arguments without flags. +// +// Export for 'math' shared extension, used via _PyObject_VectorcallTstate() +// static inline function. 
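+//
+// _PyObject_VectorcallTstate() falls back to this function whenever the
+// callable's type does not have the Py_TPFLAGS_HAVE_VECTORCALL flag set, in
+// which case _PyVectorcall_FunctionInline() below returns NULL.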
+PyAPI_FUNC(PyObject*) _PyObject_MakeTpCall( + PyThreadState *tstate, + PyObject *callable, + PyObject *const *args, Py_ssize_t nargs, + PyObject *keywords); + +// Static inline variant of public PyVectorcall_Function(). +static inline vectorcallfunc +_PyVectorcall_FunctionInline(PyObject *callable) +{ + assert(callable != NULL); + + PyTypeObject *tp = Py_TYPE(callable); + if (!PyType_HasFeature(tp, Py_TPFLAGS_HAVE_VECTORCALL)) { + return NULL; + } + assert(PyCallable_Check(callable)); + + Py_ssize_t offset = tp->tp_vectorcall_offset; + assert(offset > 0); + + vectorcallfunc ptr; + memcpy(&ptr, (char *) callable + offset, sizeof(ptr)); + return ptr; +} + + +/* Call the callable object 'callable' with the "vectorcall" calling + convention. + + args is a C array for positional arguments. + + nargsf is the number of positional arguments plus optionally the flag + PY_VECTORCALL_ARGUMENTS_OFFSET which means that the caller is allowed to + modify args[-1]. + + kwnames is a tuple of keyword names. The values of the keyword arguments + are stored in "args" after the positional arguments (note that the number + of keyword arguments does not change nargsf). kwnames can also be NULL if + there are no keyword arguments. + + keywords must only contain strings and all keys must be unique. + + Return the result on success. Raise an exception and return NULL on + error. */ +static inline PyObject * +_PyObject_VectorcallTstate(PyThreadState *tstate, PyObject *callable, + PyObject *const *args, size_t nargsf, + PyObject *kwnames) +{ + vectorcallfunc func; + PyObject *res; + + assert(kwnames == NULL || PyTuple_Check(kwnames)); + assert(args != NULL || PyVectorcall_NARGS(nargsf) == 0); + + func = _PyVectorcall_FunctionInline(callable); + if (func == NULL) { + Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); + return _PyObject_MakeTpCall(tstate, callable, args, nargs, kwnames); + } + res = func(callable, args, nargsf, kwnames); + return _Py_CheckFunctionResult(tstate, callable, res, NULL); +} + + +static inline PyObject * +_PyObject_CallNoArgsTstate(PyThreadState *tstate, PyObject *func) { + return _PyObject_VectorcallTstate(tstate, func, NULL, 0, NULL); +} + + +// Private static inline function variant of public PyObject_CallNoArgs() +static inline PyObject * +_PyObject_CallNoArgs(PyObject *func) { + EVAL_CALL_STAT_INC_IF_FUNCTION(EVAL_CALL_API, func); + PyThreadState *tstate = _PyThreadState_GET(); + return _PyObject_VectorcallTstate(tstate, func, NULL, 0, NULL); +} + + +extern PyObject *const * +_PyStack_UnpackDict(PyThreadState *tstate, + PyObject *const *args, Py_ssize_t nargs, + PyObject *kwargs, PyObject **p_kwnames); + +extern void _PyStack_UnpackDict_Free( + PyObject *const *stack, + Py_ssize_t nargs, + PyObject *kwnames); + +extern void _PyStack_UnpackDict_FreeNoDecRef( + PyObject *const *stack, + PyObject *kwnames); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_CALL_H */ diff --git a/Include/internal/pycore_capsule.h b/Include/internal/pycore_capsule.h new file mode 100644 index 0000000000000000000000000000000000000000..aa2c67f3a8f002ccef0e9b32fe4ac0cf8271164a --- /dev/null +++ b/Include/internal/pycore_capsule.h @@ -0,0 +1,17 @@ +#ifndef Py_INTERNAL_PYCAPSULE_H +#define Py_INTERNAL_PYCAPSULE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +// Export for '_socket' shared extension +PyAPI_FUNC(int) _PyCapsule_SetTraverse(PyObject *op, traverseproc traverse_func, inquiry clear_func); + +#ifdef __cplusplus +} 
+#endif
+#endif /* !Py_INTERNAL_PYCAPSULE_H */
diff --git a/Include/internal/pycore_cell.h b/Include/internal/pycore_cell.h
new file mode 100644
index 0000000000000000000000000000000000000000..27f67d57b2fb794296b4ea86e5497a7d5a6fdc63
--- /dev/null
+++ b/Include/internal/pycore_cell.h
@@ -0,0 +1,48 @@
+#ifndef Py_INTERNAL_CELL_H
+#define Py_INTERNAL_CELL_H
+
+#include "pycore_critical_section.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+// Sets the cell contents to `value` and returns the previous contents.
+// Steals a reference to `value`.
+static inline PyObject *
+PyCell_SwapTakeRef(PyCellObject *cell, PyObject *value)
+{
+    PyObject *old_value;
+    Py_BEGIN_CRITICAL_SECTION(cell);
+    old_value = cell->ob_ref;
+    cell->ob_ref = value;
+    Py_END_CRITICAL_SECTION();
+    return old_value;
+}
+
+static inline void
+PyCell_SetTakeRef(PyCellObject *cell, PyObject *value)
+{
+    PyObject *old_value = PyCell_SwapTakeRef(cell, value);
+    Py_XDECREF(old_value);
+}
+
+// Gets the cell contents. Returns a new reference.
+static inline PyObject *
+PyCell_GetRef(PyCellObject *cell)
+{
+    PyObject *res;
+    Py_BEGIN_CRITICAL_SECTION(cell);
+    res = Py_XNewRef(cell->ob_ref);
+    Py_END_CRITICAL_SECTION();
+    return res;
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_CELL_H */
diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
new file mode 100644
index 0000000000000000000000000000000000000000..043f5957d481e5ea15cf054299d46055662099cd
--- /dev/null
+++ b/Include/internal/pycore_ceval.h
@@ -0,0 +1,303 @@
+#ifndef Py_INTERNAL_CEVAL_H
+#define Py_INTERNAL_CEVAL_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "dynamic_annotations.h"  // _Py_ANNOTATE_RWLOCK_CREATE
+
+#include "pycore_interp.h"        // PyInterpreterState.eval_frame
+#include "pycore_pystate.h"       // _PyThreadState_GET()
+
+/* Forward declarations */
+struct pyruntimestate;
+struct _ceval_runtime_state;
+
+// Export for '_lsprof' shared extension
+PyAPI_FUNC(int) _PyEval_SetProfile(PyThreadState *tstate, Py_tracefunc func, PyObject *arg);
+
+extern int _PyEval_SetTrace(PyThreadState *tstate, Py_tracefunc func, PyObject *arg);
+
+extern int _PyEval_SetOpcodeTrace(PyFrameObject *f, bool enable);
+
+// Helper to look up a builtin object
+// Export for 'array' shared extension
+PyAPI_FUNC(PyObject*) _PyEval_GetBuiltin(PyObject *);
+
+extern PyObject* _PyEval_GetBuiltinId(_Py_Identifier *);
+
+extern void _PyEval_SetSwitchInterval(unsigned long microseconds);
+extern unsigned long _PyEval_GetSwitchInterval(void);
+
+// Export for '_queue' shared extension
+PyAPI_FUNC(int) _PyEval_MakePendingCalls(PyThreadState *);
+
+#ifndef Py_DEFAULT_RECURSION_LIMIT
+# define Py_DEFAULT_RECURSION_LIMIT 1000
+#endif
+
+extern void _Py_FinishPendingCalls(PyThreadState *tstate);
+extern void _PyEval_InitState(PyInterpreterState *);
+extern void _PyEval_SignalReceived(void);
+
+// bitwise flags:
+#define _Py_PENDING_MAINTHREADONLY 1
+#define _Py_PENDING_RAWFREE 2
+
+typedef int _Py_add_pending_call_result;
+#define _Py_ADD_PENDING_SUCCESS 0
+#define _Py_ADD_PENDING_FULL -1
+
+// Export for '_testinternalcapi' shared extension
+PyAPI_FUNC(_Py_add_pending_call_result) _PyEval_AddPendingCall(
+    PyInterpreterState *interp,
+    _Py_pending_call_func func,
+    void *arg,
+    int flags);
+
+#ifdef HAVE_FORK
+extern PyStatus _PyEval_ReInitThreads(PyThreadState *tstate);
+#endif
+
+// Used
by sys.call_tracing() +extern PyObject* _PyEval_CallTracing(PyObject *func, PyObject *args); + +// Used by sys.get_asyncgen_hooks() +extern PyObject* _PyEval_GetAsyncGenFirstiter(void); +extern PyObject* _PyEval_GetAsyncGenFinalizer(void); + +// Used by sys.set_asyncgen_hooks() +extern int _PyEval_SetAsyncGenFirstiter(PyObject *); +extern int _PyEval_SetAsyncGenFinalizer(PyObject *); + +// Used by sys.get_coroutine_origin_tracking_depth() +// and sys.set_coroutine_origin_tracking_depth() +extern int _PyEval_GetCoroutineOriginTrackingDepth(void); +extern int _PyEval_SetCoroutineOriginTrackingDepth(int depth); + +extern void _PyEval_Fini(void); + + +extern PyObject* _PyEval_GetBuiltins(PyThreadState *tstate); +extern PyObject* _PyEval_BuiltinsFromGlobals( + PyThreadState *tstate, + PyObject *globals); + +// Trampoline API + +typedef struct { + // Callback to initialize the trampoline state + void* (*init_state)(void); + // Callback to register every trampoline being created + void (*write_state)(void* state, const void *code_addr, + unsigned int code_size, PyCodeObject* code); + // Callback to free the trampoline state + int (*free_state)(void* state); +} _PyPerf_Callbacks; + +extern int _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *); +extern void _PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *); +extern int _PyPerfTrampoline_Init(int activate); +extern int _PyPerfTrampoline_Fini(void); +extern void _PyPerfTrampoline_FreeArenas(void); +extern int _PyIsPerfTrampolineActive(void); +extern PyStatus _PyPerfTrampoline_AfterFork_Child(void); +#ifdef PY_HAVE_PERF_TRAMPOLINE +extern _PyPerf_Callbacks _Py_perfmap_callbacks; +extern _PyPerf_Callbacks _Py_perfmap_jit_callbacks; +#endif + +static inline PyObject* +_PyEval_EvalFrame(PyThreadState *tstate, struct _PyInterpreterFrame *frame, int throwflag) +{ + EVAL_CALL_STAT_INC(EVAL_CALL_TOTAL); + if (tstate->interp->eval_frame == NULL) { + return _PyEval_EvalFrameDefault(tstate, frame, throwflag); + } + return tstate->interp->eval_frame(tstate, frame, throwflag); +} + +extern PyObject* +_PyEval_Vector(PyThreadState *tstate, + PyFunctionObject *func, PyObject *locals, + PyObject* const* args, size_t argcount, + PyObject *kwnames); + +extern int _PyEval_ThreadsInitialized(void); +extern void _PyEval_InitGIL(PyThreadState *tstate, int own_gil); +extern void _PyEval_FiniGIL(PyInterpreterState *interp); + +extern void _PyEval_AcquireLock(PyThreadState *tstate); + +extern void _PyEval_ReleaseLock(PyInterpreterState *, PyThreadState *, + int final_release); + +#ifdef Py_GIL_DISABLED +// Returns 0 or 1 if the GIL for the given thread's interpreter is disabled or +// enabled, respectively. +// +// The enabled state of the GIL will not change while one or more threads are +// attached. +static inline int +_PyEval_IsGILEnabled(PyThreadState *tstate) +{ + struct _gil_runtime_state *gil = tstate->interp->ceval.gil; + return _Py_atomic_load_int_relaxed(&gil->enabled) != 0; +} + +// Enable or disable the GIL used by the interpreter that owns tstate, which +// must be the current thread. This may affect other interpreters, if the GIL +// is shared. All three functions will be no-ops (and return 0) if the +// interpreter's `enable_gil' config is not _PyConfig_GIL_DEFAULT. +// +// Every call to _PyEval_EnableGILTransient() must be paired with exactly one +// call to either _PyEval_EnableGILPermanent() or +// _PyEval_DisableGIL(). 
_PyEval_EnableGILPermanent() and _PyEval_DisableGIL() +// must only be called while the GIL is enabled from a call to +// _PyEval_EnableGILTransient(). +// +// _PyEval_EnableGILTransient() returns 1 if it enabled the GIL, or 0 if the +// GIL was already enabled, whether transiently or permanently. The caller will +// hold the GIL upon return. +// +// _PyEval_EnableGILPermanent() returns 1 if it permanently enabled the GIL +// (which must already be enabled), or 0 if it was already permanently +// enabled. Once _PyEval_EnableGILPermanent() has been called once, all +// subsequent calls to any of the three functions will be no-ops. +// +// _PyEval_DisableGIL() returns 1 if it disabled the GIL, or 0 if the GIL was +// kept enabled because of another request, whether transient or permanent. +// +// All three functions must be called by an attached thread (this implies that +// if the GIL is enabled, the current thread must hold it). +extern int _PyEval_EnableGILTransient(PyThreadState *tstate); +extern int _PyEval_EnableGILPermanent(PyThreadState *tstate); +extern int _PyEval_DisableGIL(PyThreadState *state); +#endif + +extern void _PyEval_DeactivateOpCache(void); + + +/* --- _Py_EnterRecursiveCall() ----------------------------------------- */ + +#ifdef USE_STACKCHECK +/* With USE_STACKCHECK macro defined, trigger stack checks in + _Py_CheckRecursiveCall() on every 64th call to _Py_EnterRecursiveCall. */ +static inline int _Py_MakeRecCheck(PyThreadState *tstate) { + return (tstate->c_recursion_remaining-- < 0 + || (tstate->c_recursion_remaining & 63) == 0); +} +#else +static inline int _Py_MakeRecCheck(PyThreadState *tstate) { + return tstate->c_recursion_remaining-- < 0; +} +#endif + +// Export for '_json' shared extension, used via _Py_EnterRecursiveCall() +// static inline function. 
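+//
+// A typical call site pairs the enter/leave helpers like this (a sketch;
+// do_recursive_work() is a hypothetical helper):
+//
+//     if (_Py_EnterRecursiveCall(" while doing recursive work")) {
+//         return NULL;  // limit hit: RecursionError is already set
+//     }
+//     res = do_recursive_work(obj);
+//     _Py_LeaveRecursiveCall();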
+PyAPI_FUNC(int) _Py_CheckRecursiveCall( + PyThreadState *tstate, + const char *where); + +int _Py_CheckRecursiveCallPy( + PyThreadState *tstate); + +static inline int _Py_EnterRecursiveCallTstate(PyThreadState *tstate, + const char *where) { + return (_Py_MakeRecCheck(tstate) && _Py_CheckRecursiveCall(tstate, where)); +} + +static inline void _Py_EnterRecursiveCallTstateUnchecked(PyThreadState *tstate) { + assert(tstate->c_recursion_remaining > 0); + tstate->c_recursion_remaining--; +} + +static inline int _Py_EnterRecursiveCall(const char *where) { + PyThreadState *tstate = _PyThreadState_GET(); + return _Py_EnterRecursiveCallTstate(tstate, where); +} + +static inline void _Py_LeaveRecursiveCallTstate(PyThreadState *tstate) { + tstate->c_recursion_remaining++; +} + +static inline void _Py_LeaveRecursiveCall(void) { + PyThreadState *tstate = _PyThreadState_GET(); + _Py_LeaveRecursiveCallTstate(tstate); +} + +extern struct _PyInterpreterFrame* _PyEval_GetFrame(void); + +PyAPI_FUNC(PyObject *)_Py_MakeCoro(PyFunctionObject *func); + +/* Handle signals, pending calls, GIL drop request + and asynchronous exception */ +PyAPI_FUNC(int) _Py_HandlePending(PyThreadState *tstate); + +extern PyObject * _PyEval_GetFrameLocals(void); + +typedef PyObject *(*conversion_func)(PyObject *); + +PyAPI_DATA(const binaryfunc) _PyEval_BinaryOps[]; +PyAPI_DATA(const conversion_func) _PyEval_ConversionFuncs[]; + +PyAPI_FUNC(int) _PyEval_CheckExceptStarTypeValid(PyThreadState *tstate, PyObject* right); +PyAPI_FUNC(int) _PyEval_CheckExceptTypeValid(PyThreadState *tstate, PyObject* right); +PyAPI_FUNC(int) _PyEval_ExceptionGroupMatch(PyObject* exc_value, PyObject *match_type, PyObject **match, PyObject **rest); +PyAPI_FUNC(void) _PyEval_FormatAwaitableError(PyThreadState *tstate, PyTypeObject *type, int oparg); +PyAPI_FUNC(void) _PyEval_FormatExcCheckArg(PyThreadState *tstate, PyObject *exc, const char *format_str, PyObject *obj); +PyAPI_FUNC(void) _PyEval_FormatExcUnbound(PyThreadState *tstate, PyCodeObject *co, int oparg); +PyAPI_FUNC(void) _PyEval_FormatKwargsError(PyThreadState *tstate, PyObject *func, PyObject *kwargs); +PyAPI_FUNC(PyObject *)_PyEval_MatchClass(PyThreadState *tstate, PyObject *subject, PyObject *type, Py_ssize_t nargs, PyObject *kwargs); +PyAPI_FUNC(PyObject *)_PyEval_MatchKeys(PyThreadState *tstate, PyObject *map, PyObject *keys); +PyAPI_FUNC(int) _PyEval_UnpackIterable(PyThreadState *tstate, PyObject *v, int argcnt, int argcntafter, PyObject **sp); +PyAPI_FUNC(void) _PyEval_MonitorRaise(PyThreadState *tstate, _PyInterpreterFrame *frame, _Py_CODEUNIT *instr); +PyAPI_FUNC(void) _PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame); + + +/* Bits that can be set in PyThreadState.eval_breaker */ +#define _PY_GIL_DROP_REQUEST_BIT (1U << 0) +#define _PY_SIGNALS_PENDING_BIT (1U << 1) +#define _PY_CALLS_TO_DO_BIT (1U << 2) +#define _PY_ASYNC_EXCEPTION_BIT (1U << 3) +#define _PY_GC_SCHEDULED_BIT (1U << 4) +#define _PY_EVAL_PLEASE_STOP_BIT (1U << 5) +#define _PY_EVAL_EXPLICIT_MERGE_BIT (1U << 6) + +/* Reserve a few bits for future use */ +#define _PY_EVAL_EVENTS_BITS 8 +#define _PY_EVAL_EVENTS_MASK ((1 << _PY_EVAL_EVENTS_BITS)-1) + +static inline void +_Py_set_eval_breaker_bit(PyThreadState *tstate, uintptr_t bit) +{ + _Py_atomic_or_uintptr(&tstate->eval_breaker, bit); +} + +static inline void +_Py_unset_eval_breaker_bit(PyThreadState *tstate, uintptr_t bit) +{ + _Py_atomic_and_uintptr(&tstate->eval_breaker, ~bit); +} + +static inline int +_Py_eval_breaker_bit_is_set(PyThreadState 
*tstate, uintptr_t bit) +{ + uintptr_t b = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); + return (b & bit) != 0; +} + +// Free-threaded builds use these functions to set or unset a bit on all +// threads in the given interpreter. +void _Py_set_eval_breaker_bit_all(PyInterpreterState *interp, uintptr_t bit); +void _Py_unset_eval_breaker_bit_all(PyInterpreterState *interp, uintptr_t bit); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_CEVAL_H */ diff --git a/Include/internal/pycore_ceval_state.h b/Include/internal/pycore_ceval_state.h new file mode 100644 index 0000000000000000000000000000000000000000..009a1ea41eb9857d84fc2440db1376196f88c695 --- /dev/null +++ b/Include/internal/pycore_ceval_state.h @@ -0,0 +1,134 @@ +#ifndef Py_INTERNAL_CEVAL_STATE_H +#define Py_INTERNAL_CEVAL_STATE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_lock.h" // PyMutex +#include "pycore_gil.h" // struct _gil_runtime_state + + +typedef int (*_Py_pending_call_func)(void *); + +struct _pending_call { + _Py_pending_call_func func; + void *arg; + int flags; +}; + +#define PENDINGCALLSARRAYSIZE 300 + +#define MAXPENDINGCALLS PENDINGCALLSARRAYSIZE +/* For interpreter-level pending calls, we want to avoid spending too + much time on pending calls in any one thread, so we apply a limit. */ +#if MAXPENDINGCALLS > 100 +# define MAXPENDINGCALLSLOOP 100 +#else +# define MAXPENDINGCALLSLOOP MAXPENDINGCALLS +#endif + +/* We keep the number small to preserve as much compatibility + as possible with earlier versions. */ +#define MAXPENDINGCALLS_MAIN 32 +/* For the main thread, we want to make sure all pending calls are + run at once, for the sake of prompt signal handling. This is + unlikely to cause any problems since there should be very few + pending calls for the main thread. */ +#define MAXPENDINGCALLSLOOP_MAIN 0 + +struct _pending_calls { + PyThreadState *handling_thread; + PyMutex mutex; + /* Request for running pending calls. */ + int32_t npending; + /* The maximum allowed number of pending calls. + If the queue fills up to this point then _PyEval_AddPendingCall() + will return _Py_ADD_PENDING_FULL. */ + int32_t max; + /* We don't want a flood of pending calls to interrupt any one thread + for too long, so we keep a limit on the number handled per pass. + A value of 0 means there is no limit (other than the maximum + size of the list of pending calls). */ + int32_t maxloop; + struct _pending_call calls[PENDINGCALLSARRAYSIZE]; + int first; + int next; +}; + + +typedef enum { + PERF_STATUS_FAILED = -1, // Perf trampoline is in an invalid state + PERF_STATUS_NO_INIT = 0, // Perf trampoline is not initialized + PERF_STATUS_OK = 1, // Perf trampoline is ready to be executed +} perf_status_t; + +#ifdef PY_HAVE_PERF_TRAMPOLINE +struct code_arena_st; + +struct trampoline_api_st { + void* (*init_state)(void); + void (*write_state)(void* state, const void *code_addr, + unsigned int code_size, PyCodeObject* code); + int (*free_state)(void* state); + void *state; + Py_ssize_t code_padding; +}; +#endif + + +struct _ceval_runtime_state { + struct { +#ifdef PY_HAVE_PERF_TRAMPOLINE + perf_status_t status; + int perf_trampoline_type; + Py_ssize_t extra_code_index; + struct code_arena_st *code_arena; + struct trampoline_api_st trampoline_api; + FILE *map_file; + Py_ssize_t persist_after_fork; +#else + int _not_used; +#endif + } perf; + /* Pending calls to be made only on the main thread. 
*/ + // The signal machinery falls back on this + // so it must be especially stable and efficient. + // For example, we use a preallocated array + // for the list of pending calls. + struct _pending_calls pending_mainthread; + PyMutex sys_trace_profile_mutex; +}; + + +#ifdef PY_HAVE_PERF_TRAMPOLINE +# define _PyEval_RUNTIME_PERF_INIT \ + { \ + .status = PERF_STATUS_NO_INIT, \ + .extra_code_index = -1, \ + .persist_after_fork = 0, \ + } +#else +# define _PyEval_RUNTIME_PERF_INIT {0} +#endif + + +struct _ceval_state { + /* This variable holds the global instrumentation version. When a thread is + running, this value is overlaid onto PyThreadState.eval_breaker so that + changes in the instrumentation version will trigger the eval breaker. */ + uintptr_t instrumentation_version; + int recursion_limit; + struct _gil_runtime_state *gil; + int own_gil; + struct _pending_calls pending; +}; + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_CEVAL_STATE_H */ diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h new file mode 100644 index 0000000000000000000000000000000000000000..1fb8cc473c6e98b942d1a083f9fd96cb40f98004 --- /dev/null +++ b/Include/internal/pycore_code.h @@ -0,0 +1,596 @@ +#ifndef Py_INTERNAL_CODE_H +#define Py_INTERNAL_CODE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_lock.h" // PyMutex +#include "pycore_backoff.h" // _Py_BackoffCounter + + +/* Each instruction in a code object is a fixed-width value, + * currently 2 bytes: 1-byte opcode + 1-byte oparg. The EXTENDED_ARG + * opcode allows for larger values but the current limit is 3 uses + * of EXTENDED_ARG (see Python/compile.c), for a maximum + * 32-bit value. This aligns with the note in Python/compile.c + * (compiler_addop_i_line) indicating that the max oparg value is + * 2**32 - 1, rather than INT_MAX. + */ + +typedef union { + uint16_t cache; + struct { + uint8_t code; + uint8_t arg; + } op; + _Py_BackoffCounter counter; // First cache entry of specializable op +} _Py_CODEUNIT; + +#define _PyCode_CODE(CO) _Py_RVALUE((_Py_CODEUNIT *)(CO)->co_code_adaptive) +#define _PyCode_NBYTES(CO) (Py_SIZE(CO) * (Py_ssize_t)sizeof(_Py_CODEUNIT)) + + +/* These macros only remain defined for compatibility. */ +#define _Py_OPCODE(word) ((word).op.code) +#define _Py_OPARG(word) ((word).op.arg) + +static inline _Py_CODEUNIT +_py_make_codeunit(uint8_t opcode, uint8_t oparg) +{ + // No designated initialisers because of C++ compat + _Py_CODEUNIT word; + word.op.code = opcode; + word.op.arg = oparg; + return word; +} + +static inline void +_py_set_opcode(_Py_CODEUNIT *word, uint8_t opcode) +{ + word->op.code = opcode; +} + +#define _Py_MAKE_CODEUNIT(opcode, oparg) _py_make_codeunit((opcode), (oparg)) +#define _Py_SET_OPCODE(word, opcode) _py_set_opcode(&(word), (opcode)) + + +// We hide some of the newer PyCodeObject fields behind macros. +// This helps with backporting certain changes to 3.12. +#define _PyCode_HAS_EXECUTORS(CODE) \ + (CODE->co_executors != NULL) +#define _PyCode_HAS_INSTRUMENTATION(CODE) \ + (CODE->_co_instrumentation_version > 0) + +struct _py_code_state { + PyMutex mutex; + // Interned constants from code objects. Used by the free-threaded build. 
+ struct _Py_hashtable_t *constants; +}; + +extern PyStatus _PyCode_Init(PyInterpreterState *interp); +extern void _PyCode_Fini(PyInterpreterState *interp); + +#define CODE_MAX_WATCHERS 8 + +/* PEP 659 + * Specialization and quickening structs and helper functions + */ + + +// Inline caches. If you change the number of cache entries for an instruction, +// you must *also* update the number of cache entries in Lib/opcode.py and bump +// the magic number in Lib/importlib/_bootstrap_external.py! + +#define CACHE_ENTRIES(cache) (sizeof(cache)/sizeof(_Py_CODEUNIT)) + +typedef struct { + _Py_BackoffCounter counter; + uint16_t module_keys_version; + uint16_t builtin_keys_version; + uint16_t index; +} _PyLoadGlobalCache; + +#define INLINE_CACHE_ENTRIES_LOAD_GLOBAL CACHE_ENTRIES(_PyLoadGlobalCache) + +typedef struct { + _Py_BackoffCounter counter; +} _PyBinaryOpCache; + +#define INLINE_CACHE_ENTRIES_BINARY_OP CACHE_ENTRIES(_PyBinaryOpCache) + +typedef struct { + _Py_BackoffCounter counter; +} _PyUnpackSequenceCache; + +#define INLINE_CACHE_ENTRIES_UNPACK_SEQUENCE \ + CACHE_ENTRIES(_PyUnpackSequenceCache) + +typedef struct { + _Py_BackoffCounter counter; +} _PyCompareOpCache; + +#define INLINE_CACHE_ENTRIES_COMPARE_OP CACHE_ENTRIES(_PyCompareOpCache) + +typedef struct { + _Py_BackoffCounter counter; +} _PyBinarySubscrCache; + +#define INLINE_CACHE_ENTRIES_BINARY_SUBSCR CACHE_ENTRIES(_PyBinarySubscrCache) + +typedef struct { + _Py_BackoffCounter counter; +} _PySuperAttrCache; + +#define INLINE_CACHE_ENTRIES_LOAD_SUPER_ATTR CACHE_ENTRIES(_PySuperAttrCache) + +typedef struct { + _Py_BackoffCounter counter; + uint16_t version[2]; + uint16_t index; +} _PyAttrCache; + +typedef struct { + _Py_BackoffCounter counter; + uint16_t type_version[2]; + union { + uint16_t keys_version[2]; + uint16_t dict_offset; + }; + uint16_t descr[4]; +} _PyLoadMethodCache; + + +// MUST be the max(_PyAttrCache, _PyLoadMethodCache) +#define INLINE_CACHE_ENTRIES_LOAD_ATTR CACHE_ENTRIES(_PyLoadMethodCache) + +#define INLINE_CACHE_ENTRIES_STORE_ATTR CACHE_ENTRIES(_PyAttrCache) + +typedef struct { + _Py_BackoffCounter counter; + uint16_t func_version[2]; +} _PyCallCache; + +#define INLINE_CACHE_ENTRIES_CALL CACHE_ENTRIES(_PyCallCache) + +typedef struct { + _Py_BackoffCounter counter; +} _PyStoreSubscrCache; + +#define INLINE_CACHE_ENTRIES_STORE_SUBSCR CACHE_ENTRIES(_PyStoreSubscrCache) + +typedef struct { + _Py_BackoffCounter counter; +} _PyForIterCache; + +#define INLINE_CACHE_ENTRIES_FOR_ITER CACHE_ENTRIES(_PyForIterCache) + +typedef struct { + _Py_BackoffCounter counter; +} _PySendCache; + +#define INLINE_CACHE_ENTRIES_SEND CACHE_ENTRIES(_PySendCache) + +typedef struct { + _Py_BackoffCounter counter; + uint16_t version[2]; +} _PyToBoolCache; + +#define INLINE_CACHE_ENTRIES_TO_BOOL CACHE_ENTRIES(_PyToBoolCache) + +typedef struct { + _Py_BackoffCounter counter; +} _PyContainsOpCache; + +#define INLINE_CACHE_ENTRIES_CONTAINS_OP CACHE_ENTRIES(_PyContainsOpCache) + +// Borrowed references to common callables: +struct callable_cache { + PyObject *isinstance; + PyObject *len; + PyObject *list_append; + PyObject *object__getattribute__; +}; + +/* "Locals plus" for a code object is the set of locals + cell vars + + * free vars. This relates to variable names as well as offsets into + * the "fast locals" storage array of execution frames. The compiler + * builds the list of names, their offsets, and the corresponding + * kind of local. 
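+ *
+ * For example (a sketch; the CO_FAST_* kinds are defined below):
+ *
+ *     def f(x):
+ *         y = 1
+ *         def g():
+ *             return x
+ *         return g
+ *
+ * In f's code object, 'y' is a plain local, 'x' is an argument that also
+ * escapes into g's closure (so it is both a local and a cell), and in g's
+ * code object 'x' is a free variable.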
+ * + * Those kinds represent the source of the initial value and the + * variable's scope (as related to closures). A "local" is an + * argument or other variable defined in the current scope. A "free" + * variable is one that is defined in an outer scope and comes from + * the function's closure. A "cell" variable is a local that escapes + * into an inner function as part of a closure, and thus must be + * wrapped in a cell. Any "local" can also be a "cell", but the + * "free" kind is mutually exclusive with both. + */ + +// Note that these all fit within a byte, as do combinations. +// Later, we will use the smaller numbers to differentiate the different +// kinds of locals (e.g. pos-only arg, varkwargs, local-only). +#define CO_FAST_HIDDEN 0x10 +#define CO_FAST_LOCAL 0x20 +#define CO_FAST_CELL 0x40 +#define CO_FAST_FREE 0x80 + +typedef unsigned char _PyLocals_Kind; + +static inline _PyLocals_Kind +_PyLocals_GetKind(PyObject *kinds, int i) +{ + assert(PyBytes_Check(kinds)); + assert(0 <= i && i < PyBytes_GET_SIZE(kinds)); + char *ptr = PyBytes_AS_STRING(kinds); + return (_PyLocals_Kind)(ptr[i]); +} + +static inline void +_PyLocals_SetKind(PyObject *kinds, int i, _PyLocals_Kind kind) +{ + assert(PyBytes_Check(kinds)); + assert(0 <= i && i < PyBytes_GET_SIZE(kinds)); + char *ptr = PyBytes_AS_STRING(kinds); + ptr[i] = (char) kind; +} + + +struct _PyCodeConstructor { + /* metadata */ + PyObject *filename; + PyObject *name; + PyObject *qualname; + int flags; + + /* the code */ + PyObject *code; + int firstlineno; + PyObject *linetable; + + /* used by the code */ + PyObject *consts; + PyObject *names; + + /* mapping frame offsets to information */ + PyObject *localsplusnames; // Tuple of strings + PyObject *localspluskinds; // Bytes object, one byte per variable + + /* args (within varnames) */ + int argcount; + int posonlyargcount; + // XXX Replace argcount with posorkwargcount (argcount - posonlyargcount). + int kwonlyargcount; + + /* needed to create the frame */ + int stacksize; + + /* used by the eval loop */ + PyObject *exceptiontable; +}; + +// Using an "arguments struct" like this is helpful for maintainability +// in a case such as this with many parameters. It does bear a risk: +// if the struct changes and callers are not updated properly then the +// compiler will not catch problems (like a missing argument). This can +// cause hard-to-debug problems. The risk is mitigated by the use of +// check_code() in codeobject.c. However, we may decide to switch +// back to a regular function signature. Regardless, this approach +// wouldn't be appropriate if this weren't a strictly internal API. +// (See the comments in https://github.com/python/cpython/pull/26258.) +extern int _PyCode_Validate(struct _PyCodeConstructor *); +extern PyCodeObject* _PyCode_New(struct _PyCodeConstructor *); + + +/* Private API */ + +/* Getters for internal PyCodeObject data. */ +extern PyObject* _PyCode_GetVarnames(PyCodeObject *); +extern PyObject* _PyCode_GetCellvars(PyCodeObject *); +extern PyObject* _PyCode_GetFreevars(PyCodeObject *); +extern PyObject* _PyCode_GetCode(PyCodeObject *); + +/** API for initializing the line number tables. */ +extern int _PyCode_InitAddressRange(PyCodeObject* co, PyCodeAddressRange *bounds); + +/** Out of process API for initializing the location table. */ +extern void _PyLineTable_InitAddressRange( + const char *linetable, + Py_ssize_t length, + int firstlineno, + PyCodeAddressRange *range); + +/** API for traversing the line number table. 
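+ *
+ * A traversal sketch (assuming the usual ar_start/ar_end/ar_line fields of
+ * PyCodeAddressRange):
+ *
+ *     PyCodeAddressRange range;
+ *     _PyCode_InitAddressRange(co, &range);
+ *     while (_PyLineTable_NextAddressRange(&range)) {
+ *         ... inspect range.ar_start, range.ar_end, range.ar_line ...
+ *     }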
*/ +extern int _PyLineTable_NextAddressRange(PyCodeAddressRange *range); +extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range); + +/** API for executors */ +extern void _PyCode_Clear_Executors(PyCodeObject *code); + +#ifdef Py_GIL_DISABLED +// gh-115999 tracks progress on addressing this. +#define ENABLE_SPECIALIZATION 0 +#else +#define ENABLE_SPECIALIZATION 1 +#endif + +/* Specialization functions */ + +extern void _Py_Specialize_LoadSuperAttr(PyObject *global_super, PyObject *cls, + _Py_CODEUNIT *instr, int load_method); +extern void _Py_Specialize_LoadAttr(PyObject *owner, _Py_CODEUNIT *instr, + PyObject *name); +extern void _Py_Specialize_StoreAttr(PyObject *owner, _Py_CODEUNIT *instr, + PyObject *name); +extern void _Py_Specialize_LoadGlobal(PyObject *globals, PyObject *builtins, + _Py_CODEUNIT *instr, PyObject *name); +extern void _Py_Specialize_BinarySubscr(PyObject *sub, PyObject *container, + _Py_CODEUNIT *instr); +extern void _Py_Specialize_StoreSubscr(PyObject *container, PyObject *sub, + _Py_CODEUNIT *instr); +extern void _Py_Specialize_Call(PyObject *callable, _Py_CODEUNIT *instr, + int nargs); +extern void _Py_Specialize_BinaryOp(PyObject *lhs, PyObject *rhs, _Py_CODEUNIT *instr, + int oparg, PyObject **locals); +extern void _Py_Specialize_CompareOp(PyObject *lhs, PyObject *rhs, + _Py_CODEUNIT *instr, int oparg); +extern void _Py_Specialize_UnpackSequence(PyObject *seq, _Py_CODEUNIT *instr, + int oparg); +extern void _Py_Specialize_ForIter(PyObject *iter, _Py_CODEUNIT *instr, int oparg); +extern void _Py_Specialize_Send(PyObject *receiver, _Py_CODEUNIT *instr); +extern void _Py_Specialize_ToBool(PyObject *value, _Py_CODEUNIT *instr); +extern void _Py_Specialize_ContainsOp(PyObject *value, _Py_CODEUNIT *instr); + +#ifdef Py_STATS + +#include "pycore_bitutils.h" // _Py_bit_length + +#define STAT_INC(opname, name) do { if (_Py_stats) _Py_stats->opcode_stats[opname].specialization.name++; } while (0) +#define STAT_DEC(opname, name) do { if (_Py_stats) _Py_stats->opcode_stats[opname].specialization.name--; } while (0) +#define OPCODE_EXE_INC(opname) do { if (_Py_stats) _Py_stats->opcode_stats[opname].execution_count++; } while (0) +#define CALL_STAT_INC(name) do { if (_Py_stats) _Py_stats->call_stats.name++; } while (0) +#define OBJECT_STAT_INC(name) do { if (_Py_stats) _Py_stats->object_stats.name++; } while (0) +#define OBJECT_STAT_INC_COND(name, cond) \ + do { if (_Py_stats && cond) _Py_stats->object_stats.name++; } while (0) +#define EVAL_CALL_STAT_INC(name) do { if (_Py_stats) _Py_stats->call_stats.eval_calls[name]++; } while (0) +#define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) \ + do { if (_Py_stats && PyFunction_Check(callable)) _Py_stats->call_stats.eval_calls[name]++; } while (0) +#define GC_STAT_ADD(gen, name, n) do { if (_Py_stats) _Py_stats->gc_stats[(gen)].name += (n); } while (0) +#define OPT_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) +#define UOP_STAT_INC(opname, name) do { if (_Py_stats) { assert(opname < 512); _Py_stats->optimization_stats.opcode[opname].name++; } } while (0) +#define UOP_PAIR_INC(uopcode, lastuop) \ + do { \ + if (lastuop && _Py_stats) { \ + _Py_stats->optimization_stats.opcode[lastuop].pair_count[uopcode]++; \ + } \ + lastuop = uopcode; \ + } while (0) +#define OPT_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) +#define OPT_ERROR_IN_OPCODE(opname) do { if (_Py_stats) 
_Py_stats->optimization_stats.error_in_opcode[opname]++; } while (0) +#define OPT_HIST(length, name) \ + do { \ + if (_Py_stats) { \ + int bucket = _Py_bit_length(length >= 1 ? length - 1 : 0); \ + bucket = (bucket >= _Py_UOP_HIST_SIZE) ? _Py_UOP_HIST_SIZE - 1 : bucket; \ + _Py_stats->optimization_stats.name[bucket]++; \ + } \ + } while (0) +#define RARE_EVENT_STAT_INC(name) do { if (_Py_stats) _Py_stats->rare_event_stats.name++; } while (0) + +// Export for '_opcode' shared extension +PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); + +#else +#define STAT_INC(opname, name) ((void)0) +#define STAT_DEC(opname, name) ((void)0) +#define OPCODE_EXE_INC(opname) ((void)0) +#define CALL_STAT_INC(name) ((void)0) +#define OBJECT_STAT_INC(name) ((void)0) +#define OBJECT_STAT_INC_COND(name, cond) ((void)0) +#define EVAL_CALL_STAT_INC(name) ((void)0) +#define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0) +#define GC_STAT_ADD(gen, name, n) ((void)0) +#define OPT_STAT_INC(name) ((void)0) +#define UOP_STAT_INC(opname, name) ((void)0) +#define UOP_PAIR_INC(uopcode, lastuop) ((void)0) +#define OPT_UNSUPPORTED_OPCODE(opname) ((void)0) +#define OPT_ERROR_IN_OPCODE(opname) ((void)0) +#define OPT_HIST(length, name) ((void)0) +#define RARE_EVENT_STAT_INC(name) ((void)0) +#endif // !Py_STATS + +// Utility functions for reading/writing 32/64-bit values in the inline caches. +// Great care should be taken to ensure that these functions remain correct and +// performant! They should compile to just "move" instructions on all supported +// compilers and platforms. + +// We use memcpy to let the C compiler handle unaligned accesses and endianness +// issues for us. It also seems to produce better code than manual copying for +// most compilers (see https://blog.regehr.org/archives/959 for more info). + +static inline void +write_u32(uint16_t *p, uint32_t val) +{ + memcpy(p, &val, sizeof(val)); +} + +static inline void +write_u64(uint16_t *p, uint64_t val) +{ + memcpy(p, &val, sizeof(val)); +} + +static inline void +write_obj(uint16_t *p, PyObject *val) +{ + memcpy(p, &val, sizeof(val)); +} + +static inline uint16_t +read_u16(uint16_t *p) +{ + return *p; +} + +static inline uint32_t +read_u32(uint16_t *p) +{ + uint32_t val; + memcpy(&val, p, sizeof(val)); + return val; +} + +static inline uint64_t +read_u64(uint16_t *p) +{ + uint64_t val; + memcpy(&val, p, sizeof(val)); + return val; +} + +static inline PyObject * +read_obj(uint16_t *p) +{ + PyObject *val; + memcpy(&val, p, sizeof(val)); + return val; +} + +/* See Objects/exception_handling_notes.txt for details. 
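+
+   A worked example for write_varint() below (a sketch): encoding 130 first
+   emits 64 | (130 & 63) = 0x42 with the continuation bit set, then
+   130 >> 6 = 2 as the final byte, so the output is {0x42, 0x02}; decoding
+   the least-significant-first groups gives 2 + (2 << 6) = 130. Note that
+   parse_varint() consumes the most-significant 6-bit group first, while
+   write_varint() emits the least-significant group first: the two functions
+   implement different byte orders and are not inverses of each other.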
+ */
+static inline unsigned char *
+parse_varint(unsigned char *p, int *result) {
+    int val = p[0] & 63;
+    while (p[0] & 64) {
+        p++;
+        val = (val << 6) | (p[0] & 63);
+    }
+    *result = val;
+    return p+1;
+}
+
+static inline int
+write_varint(uint8_t *ptr, unsigned int val)
+{
+    int written = 1;
+    while (val >= 64) {
+        *ptr++ = 64 | (val & 63);
+        val >>= 6;
+        written++;
+    }
+    *ptr = (uint8_t)val;
+    return written;
+}
+
+static inline int
+write_signed_varint(uint8_t *ptr, int val)
+{
+    unsigned int uval;
+    if (val < 0) {
+        // (unsigned int)(-val) has undefined behavior for INT_MIN
+        uval = ((0 - (unsigned int)val) << 1) | 1;
+    }
+    else {
+        uval = (unsigned int)val << 1;
+    }
+    return write_varint(ptr, uval);
+}
+
+static inline int
+write_location_entry_start(uint8_t *ptr, int code, int length)
+{
+    assert((code & 15) == code);
+    *ptr = 128 | (uint8_t)(code << 3) | (uint8_t)(length - 1);
+    return 1;
+}
+
+
+/** Counters
+ * The first 16-bit value in each inline cache is a counter.
+ *
+ * When counting executions until the next specialization attempt,
+ * exponential backoff is used to reduce the number of specialization failures.
+ * See pycore_backoff.h for more details.
+ * On a specialization failure, the backoff counter is restarted.
+ */
+
+#include "pycore_backoff.h"
+
+// A value of 1 means that we attempt to specialize the *second* time each
+// instruction is executed. Executing twice is a much better indicator of
+// "hotness" than executing once, but additional warmup delays only prevent
+// specialization. Most types stabilize by the second execution, too:
+#define ADAPTIVE_WARMUP_VALUE 1
+#define ADAPTIVE_WARMUP_BACKOFF 1
+
+// A value of 52 means that we attempt to re-specialize after 53 misses (a prime
+// number, useful for avoiding artifacts if every nth value is a different type
+// or something). Setting the backoff to 0 means that the counter is reset to
+// the same state as a warming-up instruction (value == 1, backoff == 1) after
+// deoptimization. This isn't strictly necessary, but it is a bit easier to
+// reason about when thinking about the opcode transitions as a state machine:
+#define ADAPTIVE_COOLDOWN_VALUE 52
+#define ADAPTIVE_COOLDOWN_BACKOFF 0
+
+// Can't assert this in pycore_backoff.h because of header order dependencies
+#if COLD_EXIT_INITIAL_VALUE <= ADAPTIVE_COOLDOWN_VALUE
+# error "Cold exit value should be larger than adaptive cooldown value"
+#endif
+
+static inline _Py_BackoffCounter
+adaptive_counter_bits(uint16_t value, uint16_t backoff) {
+    return make_backoff_counter(value, backoff);
+}
+
+static inline _Py_BackoffCounter
+adaptive_counter_warmup(void) {
+    return adaptive_counter_bits(ADAPTIVE_WARMUP_VALUE,
+                                 ADAPTIVE_WARMUP_BACKOFF);
+}
+
+static inline _Py_BackoffCounter
+adaptive_counter_cooldown(void) {
+    return adaptive_counter_bits(ADAPTIVE_COOLDOWN_VALUE,
+                                 ADAPTIVE_COOLDOWN_BACKOFF);
+}
+
+static inline _Py_BackoffCounter
+adaptive_counter_backoff(_Py_BackoffCounter counter) {
+    return restart_backoff_counter(counter);
+}
+
+
+/* Comparison bit masks. */
+
+/* Note this evaluates its arguments twice each */
+#define COMPARISON_BIT(x, y) (1 << (2 * ((x) >= (y)) + ((x) <= (y))))
+
+/*
+ * The following bits are chosen so that the value of
+ * COMPARISON_BIT(left, right)
+ * masked by the values below will be non-zero if the
+ * comparison is true, and zero if it is false */
+
+/* This is for values that are unordered, ie. NaN, not types that are unordered, e.g.
sets */ +#define COMPARISON_UNORDERED 1 + +#define COMPARISON_LESS_THAN 2 +#define COMPARISON_GREATER_THAN 4 +#define COMPARISON_EQUALS 8 + +#define COMPARISON_NOT_EQUALS (COMPARISON_UNORDERED | COMPARISON_LESS_THAN | COMPARISON_GREATER_THAN) + +extern int _Py_Instrument(PyCodeObject *co, PyInterpreterState *interp); + +extern int _Py_GetBaseOpcode(PyCodeObject *code, int offset); + +extern int _PyInstruction_GetLength(PyCodeObject *code, int offset); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_CODE_H */ diff --git a/Include/internal/pycore_codecs.h b/Include/internal/pycore_codecs.h new file mode 100644 index 0000000000000000000000000000000000000000..5e2d5c5ce9d868a2b87824e458028e2b65ad922b --- /dev/null +++ b/Include/internal/pycore_codecs.h @@ -0,0 +1,86 @@ +#ifndef Py_INTERNAL_CODECS_H +#define Py_INTERNAL_CODECS_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_lock.h" // PyMutex + +/* Initialize codecs-related state for the given interpreter, including + registering the first codec search function. Must be called before any other + PyCodec-related functions, and while only one thread is active. */ +extern PyStatus _PyCodec_InitRegistry(PyInterpreterState *interp); + +/* Finalize codecs-related state for the given interpreter. No PyCodec-related + functions other than PyCodec_Unregister() may be called after this. */ +extern void _PyCodec_Fini(PyInterpreterState *interp); + +extern PyObject* _PyCodec_Lookup(const char *encoding); + +/* Text codec specific encoding and decoding API. + + Checks the encoding against a list of codecs which do not + implement a str<->bytes encoding before attempting the + operation. + + Please note that these APIs are internal and should not + be used in Python C extensions. + + XXX (ncoghlan): should we make these, or something like them, public + in Python 3.5+? + + */ +extern PyObject* _PyCodec_LookupTextEncoding( + const char *encoding, + const char *alternate_command); + +extern PyObject* _PyCodec_EncodeText( + PyObject *object, + const char *encoding, + const char *errors); + +extern PyObject* _PyCodec_DecodeText( + PyObject *object, + const char *encoding, + const char *errors); + +/* These two aren't actually text encoding specific, but _io.TextIOWrapper + * is the only current API consumer. + */ +extern PyObject* _PyCodecInfo_GetIncrementalDecoder( + PyObject *codec_info, + const char *errors); + +extern PyObject* _PyCodecInfo_GetIncrementalEncoder( + PyObject *codec_info, + const char *errors); + +// Per-interpreter state used by codecs.c. +struct codecs_state { + // A list of callable objects used to search for codecs. + PyObject *search_path; + + // A dict mapping codec names to codecs returned from a callable in + // search_path. + PyObject *search_cache; + + // A dict mapping error handling strategies to functions to implement them. + PyObject *error_registry; + +#ifdef Py_GIL_DISABLED + // Used to safely delete a specific item from search_path. + PyMutex search_path_mutex; +#endif + + // Whether or not the rest of the state is initialized. 
+ int initialized; +}; + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_CODECS_H */ diff --git a/Include/internal/pycore_compile.h b/Include/internal/pycore_compile.h new file mode 100644 index 0000000000000000000000000000000000000000..3c21f83a18b52acb264ca6fa09ae602634654244 --- /dev/null +++ b/Include/internal/pycore_compile.h @@ -0,0 +1,118 @@ +#ifndef Py_INTERNAL_COMPILE_H +#define Py_INTERNAL_COMPILE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_symtable.h" // _Py_SourceLocation +#include "pycore_instruction_sequence.h" + +struct _arena; // Type defined in pycore_pyarena.h +struct _mod; // Type defined in pycore_ast.h + +// Export for 'test_peg_generator' shared extension +PyAPI_FUNC(PyCodeObject*) _PyAST_Compile( + struct _mod *mod, + PyObject *filename, + PyCompilerFlags *flags, + int optimize, + struct _arena *arena); + +/* AST optimizations */ +extern int _PyCompile_AstOptimize( + struct _mod *mod, + PyObject *filename, + PyCompilerFlags *flags, + int optimize, + struct _arena *arena); + +struct _Py_SourceLocation; + +extern int _PyAST_Optimize( + struct _mod *, + struct _arena *arena, + int optimize, + int ff_features); + + +typedef struct { + PyObject *u_name; + PyObject *u_qualname; /* dot-separated qualified name (lazy) */ + + /* The following fields are dicts that map objects to + the index of them in co_XXX. The index is used as + the argument for opcodes that refer to those collections. + */ + PyObject *u_consts; /* all constants */ + PyObject *u_names; /* all names */ + PyObject *u_varnames; /* local variables */ + PyObject *u_cellvars; /* cell variables */ + PyObject *u_freevars; /* free variables */ + PyObject *u_fasthidden; /* dict; keys are names that are fast-locals only + temporarily within an inlined comprehension. When + value is True, treat as fast-local. 
*/ + + Py_ssize_t u_argcount; /* number of arguments for block */ + Py_ssize_t u_posonlyargcount; /* number of positional only arguments for block */ + Py_ssize_t u_kwonlyargcount; /* number of keyword only arguments for block */ + + int u_firstlineno; /* the first lineno of the block */ +} _PyCompile_CodeUnitMetadata; + + +/* Utility for a number of growing arrays used in the compiler */ +int _PyCompile_EnsureArrayLargeEnough( + int idx, + void **array, + int *alloc, + int default_alloc, + size_t item_size); + +int _PyCompile_ConstCacheMergeOne(PyObject *const_cache, PyObject **obj); + + +// Export for '_opcode' extension module +PyAPI_FUNC(int) _PyCompile_OpcodeIsValid(int opcode); +PyAPI_FUNC(int) _PyCompile_OpcodeHasArg(int opcode); +PyAPI_FUNC(int) _PyCompile_OpcodeHasConst(int opcode); +PyAPI_FUNC(int) _PyCompile_OpcodeHasName(int opcode); +PyAPI_FUNC(int) _PyCompile_OpcodeHasJump(int opcode); +PyAPI_FUNC(int) _PyCompile_OpcodeHasFree(int opcode); +PyAPI_FUNC(int) _PyCompile_OpcodeHasLocal(int opcode); +PyAPI_FUNC(int) _PyCompile_OpcodeHasExc(int opcode); + +PyAPI_FUNC(PyObject*) _PyCompile_GetUnaryIntrinsicName(int index); +PyAPI_FUNC(PyObject*) _PyCompile_GetBinaryIntrinsicName(int index); + +/* Access compiler internals for unit testing */ + +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(PyObject*) _PyCompile_CleanDoc(PyObject *doc); + +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(PyObject*) _PyCompile_CodeGen( + PyObject *ast, + PyObject *filename, + PyCompilerFlags *flags, + int optimize, + int compile_mode); + +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(PyObject*) _PyCompile_OptimizeCfg( + PyObject *instructions, + PyObject *consts, + int nlocals); + +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(PyCodeObject*) +_PyCompile_Assemble(_PyCompile_CodeUnitMetadata *umd, PyObject *filename, + PyObject *instructions); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_COMPILE_H */ diff --git a/Include/internal/pycore_complexobject.h b/Include/internal/pycore_complexobject.h new file mode 100644 index 0000000000000000000000000000000000000000..54713536eedc462ce82400fd0fa0767dd9c0751d --- /dev/null +++ b/Include/internal/pycore_complexobject.h @@ -0,0 +1,25 @@ +#ifndef Py_INTERNAL_COMPLEXOBJECT_H +#define Py_INTERNAL_COMPLEXOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_unicodeobject.h" // _PyUnicodeWriter + +/* Format the object based on the format_spec, as defined in PEP 3101 + (Advanced String Formatting). 
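+
+   As an illustrative example, format(1+2j, ".2f") is ultimately routed
+   through this writer and produces '1.00+2.00j'.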
+ */
+extern int _PyComplex_FormatAdvancedWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    PyObject *format_spec,
+    Py_ssize_t start,
+    Py_ssize_t end);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // !Py_INTERNAL_COMPLEXOBJECT_H
diff --git a/Include/internal/pycore_condvar.h b/Include/internal/pycore_condvar.h
new file mode 100644
index 0000000000000000000000000000000000000000..55271f0a4116ba03fe5314751adb2cb4e0067a6c
--- /dev/null
+++ b/Include/internal/pycore_condvar.h
@@ -0,0 +1,93 @@
+#ifndef Py_INTERNAL_CONDVAR_H
+#define Py_INTERNAL_CONDVAR_H
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_pythread.h"      // _POSIX_THREADS
+
+
+#ifdef _POSIX_THREADS
+/*
+ * POSIX support
+ */
+#define Py_HAVE_CONDVAR
+
+#ifdef HAVE_PTHREAD_H
+# include <pthread.h>             // pthread_mutex_t
+#endif
+
+#define PyMUTEX_T pthread_mutex_t
+#define PyCOND_T pthread_cond_t
+
+#elif defined(NT_THREADS)
+/*
+ * Windows (XP, 2003 server and later, as well as (hopefully) CE) support
+ *
+ * Emulated condition variables that work on XP and later, plus
+ * native support on Vista and onwards.
+ */
+#define Py_HAVE_CONDVAR
+
+/* include windows if it hasn't been done before */
+#ifndef WIN32_LEAN_AND_MEAN
+# define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>              // CRITICAL_SECTION
+
+/* options */
+/* emulated condition variables are provided for those that want
+ * to target Windows XP or earlier. Modify this macro to enable them.
+ */
+#ifndef _PY_EMULATED_WIN_CV
+#define _PY_EMULATED_WIN_CV 0  /* use non-emulated condition variables */
+#endif
+
+/* fall back to emulation if targeting earlier than Vista */
+#if !defined NTDDI_VISTA || NTDDI_VERSION < NTDDI_VISTA
+#undef _PY_EMULATED_WIN_CV
+#define _PY_EMULATED_WIN_CV 1
+#endif
+
+#if _PY_EMULATED_WIN_CV
+
+typedef CRITICAL_SECTION PyMUTEX_T;
+
+/* The ConditionVariable object. From XP onwards it is easily emulated
+   with a Semaphore.
+   Semaphores are available on Windows XP (2003 server) and later.
+   We use a Semaphore rather than an auto-reset event, because although
+   an auto-reset event might appear to solve the lost-wakeup bug (race
+   condition between releasing the outer lock and waiting) because it
+   maintains state even though a wait hasn't happened, there is still
+   a lost wakeup problem if more than one thread is interrupted in the
+   critical place. A semaphore solves that, because its state is
+   counted, not Boolean.
+   Because it is ok to signal a condition variable with no one
+   waiting, we need to keep track of the number of
+   waiting threads. Otherwise, the semaphore's state could rise
+   without bound. This also helps reduce the number of "spurious wakeups"
+   that would otherwise happen.
+ */
+
+typedef struct _PyCOND_T
+{
+    HANDLE sem;
+    int waiting; /* to allow PyCOND_SIGNAL to be a no-op */
+} PyCOND_T;
+
+#else /* !_PY_EMULATED_WIN_CV */
+
+/* Use native Windows primitives if build target is Vista or higher */
+
+/* SRWLOCK is faster and better than CriticalSection */
+typedef SRWLOCK PyMUTEX_T;
+
+typedef CONDITION_VARIABLE PyCOND_T;
+
+#endif /* _PY_EMULATED_WIN_CV */
+
+#endif /* _POSIX_THREADS, NT_THREADS */
+
+#endif /* Py_INTERNAL_CONDVAR_H */
diff --git a/Include/internal/pycore_context.h b/Include/internal/pycore_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..10c1f1e52be04000b8f49fb782c7e817465b3507
--- /dev/null
+++ b/Include/internal/pycore_context.h
@@ -0,0 +1,61 @@
+#ifndef Py_INTERNAL_CONTEXT_H
+#define Py_INTERNAL_CONTEXT_H
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_freelist.h"   // _PyFreeListState
+#include "pycore_hamt.h"       // PyHamtObject
+
+
+extern PyTypeObject _PyContextTokenMissing_Type;
+
+/* runtime lifecycle */
+
+PyStatus _PyContext_Init(PyInterpreterState *);
+
+
+/* other API */
+
+typedef struct {
+    PyObject_HEAD
+} _PyContextTokenMissing;
+
+struct _pycontextobject {
+    PyObject_HEAD
+    PyContext *ctx_prev;
+    PyHamtObject *ctx_vars;
+    PyObject *ctx_weakreflist;
+    int ctx_entered;
+};
+
+
+struct _pycontextvarobject {
+    PyObject_HEAD
+    PyObject *var_name;
+    PyObject *var_default;
+#ifndef Py_GIL_DISABLED
+    PyObject *var_cached;
+    uint64_t var_cached_tsid;
+    uint64_t var_cached_tsver;
+#endif
+    Py_hash_t var_hash;
+};
+
+
+struct _pycontexttokenobject {
+    PyObject_HEAD
+    PyContext *tok_ctx;
+    PyContextVar *tok_var;
+    PyObject *tok_oldval;
+    int tok_used;
+};
+
+
+// _testinternalcapi.hamt() used by tests.
+// Export for '_testcapi' shared extension
+PyAPI_FUNC(PyObject*) _PyContext_NewHamtForTests(void);
+
+
+#endif /* !Py_INTERNAL_CONTEXT_H */
diff --git a/Include/internal/pycore_critical_section.h b/Include/internal/pycore_critical_section.h
new file mode 100644
index 0000000000000000000000000000000000000000..78cd0d549726608fed94f1a3b2022944d263345f
--- /dev/null
+++ b/Include/internal/pycore_critical_section.h
@@ -0,0 +1,233 @@
+#ifndef Py_INTERNAL_CRITICAL_SECTION_H
+#define Py_INTERNAL_CRITICAL_SECTION_H
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_lock.h"        // PyMutex
+#include "pycore_pystate.h"     // _PyThreadState_GET()
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Tagged pointers to critical sections use the two least significant bits to
+// mark if the pointed-to critical section is inactive and whether it is a
+// PyCriticalSection2 object.
+#define _Py_CRITICAL_SECTION_INACTIVE 0x1
+#define _Py_CRITICAL_SECTION_TWO_MUTEXES 0x2
+#define _Py_CRITICAL_SECTION_MASK 0x3
+
+#ifdef Py_GIL_DISABLED
+# define Py_BEGIN_CRITICAL_SECTION_MUT(mutex) \
+    { \
+        PyCriticalSection _py_cs; \
+        _PyCriticalSection_BeginMutex(&_py_cs, mutex)
+
+# define Py_BEGIN_CRITICAL_SECTION2_MUT(m1, m2) \
+    { \
+        PyCriticalSection2 _py_cs2; \
+        _PyCriticalSection2_BeginMutex(&_py_cs2, m1, m2)
+
+// Specialized version of critical section locking to safely use
+// PySequence_Fast APIs without the GIL. For performance, the argument *to*
+// PySequence_Fast() is provided to the macro, not the *result* of
+// PySequence_Fast(), which would require an extra test to determine if the
+// lock must be acquired.
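+//
+// A minimal usage sketch (hypothetical caller, not part of this header):
+//
+//     static Py_ssize_t
+//     count_items(PyObject *seq)
+//     {
+//         Py_ssize_t n = -1;
+//         Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);
+//         PyObject *fast = PySequence_Fast(seq, "expected a sequence");
+//         if (fast != NULL) {
+//             n = PySequence_Fast_GET_SIZE(fast);
+//             Py_DECREF(fast);
+//         }
+//         Py_END_CRITICAL_SECTION_SEQUENCE_FAST();
+//         return n;
+//     }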
+# define Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(original) \
+    { \
+        PyObject *_orig_seq = _PyObject_CAST(original); \
+        const bool _should_lock_cs = PyList_CheckExact(_orig_seq); \
+        PyCriticalSection _cs; \
+        if (_should_lock_cs) { \
+            _PyCriticalSection_Begin(&_cs, _orig_seq); \
+        }
+
+# define Py_END_CRITICAL_SECTION_SEQUENCE_FAST() \
+        if (_should_lock_cs) { \
+            PyCriticalSection_End(&_cs); \
+        } \
+    }
+
+// Asserts that the mutex is locked. The mutex must be held by the
+// top-most critical section, otherwise there's the possibility
+// that the mutex would be swapped out in some code paths.
+#define _Py_CRITICAL_SECTION_ASSERT_MUTEX_LOCKED(mutex) \
+    _PyCriticalSection_AssertHeld(mutex)
+
+// Asserts that the mutex for the given object is locked. The mutex must
+// be held by the top-most critical section, otherwise there's the
+// possibility that the mutex would be swapped out in some code paths.
+#ifdef Py_DEBUG
+
+# define _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(op) \
+    if (Py_REFCNT(op) != 1) { \
+        _Py_CRITICAL_SECTION_ASSERT_MUTEX_LOCKED(&_PyObject_CAST(op)->ob_mutex); \
+    }
+
+#else /* Py_DEBUG */
+
+# define _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(op)
+
+#endif /* Py_DEBUG */
+
+#else /* !Py_GIL_DISABLED */
+// The critical section APIs are no-ops with the GIL.
+# define Py_BEGIN_CRITICAL_SECTION_MUT(mut) {
+# define Py_BEGIN_CRITICAL_SECTION2_MUT(m1, m2) {
+# define Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(original) {
+# define Py_END_CRITICAL_SECTION_SEQUENCE_FAST() }
+# define _Py_CRITICAL_SECTION_ASSERT_MUTEX_LOCKED(mutex)
+# define _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(op)
+#endif /* !Py_GIL_DISABLED */
+
+// Resumes the top-most critical section.
+PyAPI_FUNC(void)
+_PyCriticalSection_Resume(PyThreadState *tstate);
+
+// (private) slow path for locking the mutex
+PyAPI_FUNC(void)
+_PyCriticalSection_BeginSlow(PyCriticalSection *c, PyMutex *m);
+
+PyAPI_FUNC(void)
+_PyCriticalSection2_BeginSlow(PyCriticalSection2 *c, PyMutex *m1, PyMutex *m2,
+                              int is_m1_locked);
+
+PyAPI_FUNC(void)
+_PyCriticalSection_SuspendAll(PyThreadState *tstate);
+
+#ifdef Py_GIL_DISABLED
+
+static inline int
+_PyCriticalSection_IsActive(uintptr_t tag)
+{
+    return tag != 0 && (tag & _Py_CRITICAL_SECTION_INACTIVE) == 0;
+}
+
+static inline void
+_PyCriticalSection_BeginMutex(PyCriticalSection *c, PyMutex *m)
+{
+    if (PyMutex_LockFast(&m->_bits)) {
+        PyThreadState *tstate = _PyThreadState_GET();
+        c->_cs_mutex = m;
+        c->_cs_prev = tstate->critical_section;
+        tstate->critical_section = (uintptr_t)c;
+    }
+    else {
+        _PyCriticalSection_BeginSlow(c, m);
+    }
+}
+
+static inline void
+_PyCriticalSection_Begin(PyCriticalSection *c, PyObject *op)
+{
+    _PyCriticalSection_BeginMutex(c, &op->ob_mutex);
+}
+#define PyCriticalSection_Begin _PyCriticalSection_Begin
+
+// Removes the top-most critical section from the thread's stack of critical
+// sections. If the new top-most critical section is inactive, then it is
+// resumed.
+static inline void +_PyCriticalSection_Pop(PyCriticalSection *c) +{ + PyThreadState *tstate = _PyThreadState_GET(); + uintptr_t prev = c->_cs_prev; + tstate->critical_section = prev; + + if ((prev & _Py_CRITICAL_SECTION_INACTIVE) != 0) { + _PyCriticalSection_Resume(tstate); + } +} + +static inline void +_PyCriticalSection_End(PyCriticalSection *c) +{ + PyMutex_Unlock(c->_cs_mutex); + _PyCriticalSection_Pop(c); +} +#define PyCriticalSection_End _PyCriticalSection_End + +static inline void +_PyCriticalSection2_BeginMutex(PyCriticalSection2 *c, PyMutex *m1, PyMutex *m2) +{ + if (m1 == m2) { + // If the two mutex arguments are the same, treat this as a critical + // section with a single mutex. + c->_cs_mutex2 = NULL; + _PyCriticalSection_BeginMutex(&c->_cs_base, m1); + return; + } + + if ((uintptr_t)m2 < (uintptr_t)m1) { + // Sort the mutexes so that the lower address is locked first. + // The exact order does not matter, but we need to acquire the mutexes + // in a consistent order to avoid lock ordering deadlocks. + PyMutex *tmp = m1; + m1 = m2; + m2 = tmp; + } + + if (PyMutex_LockFast(&m1->_bits)) { + if (PyMutex_LockFast(&m2->_bits)) { + PyThreadState *tstate = _PyThreadState_GET(); + c->_cs_base._cs_mutex = m1; + c->_cs_mutex2 = m2; + c->_cs_base._cs_prev = tstate->critical_section; + + uintptr_t p = (uintptr_t)c | _Py_CRITICAL_SECTION_TWO_MUTEXES; + tstate->critical_section = p; + } + else { + _PyCriticalSection2_BeginSlow(c, m1, m2, 1); + } + } + else { + _PyCriticalSection2_BeginSlow(c, m1, m2, 0); + } +} + +static inline void +_PyCriticalSection2_Begin(PyCriticalSection2 *c, PyObject *a, PyObject *b) +{ + _PyCriticalSection2_BeginMutex(c, &a->ob_mutex, &b->ob_mutex); +} +#define PyCriticalSection2_Begin _PyCriticalSection2_Begin + +static inline void +_PyCriticalSection2_End(PyCriticalSection2 *c) +{ + if (c->_cs_mutex2) { + PyMutex_Unlock(c->_cs_mutex2); + } + PyMutex_Unlock(c->_cs_base._cs_mutex); + _PyCriticalSection_Pop(&c->_cs_base); +} +#define PyCriticalSection2_End _PyCriticalSection2_End + +static inline void +_PyCriticalSection_AssertHeld(PyMutex *mutex) +{ +#ifdef Py_DEBUG + PyThreadState *tstate = _PyThreadState_GET(); + uintptr_t prev = tstate->critical_section; + if (prev & _Py_CRITICAL_SECTION_TWO_MUTEXES) { + PyCriticalSection2 *cs = (PyCriticalSection2 *)(prev & ~_Py_CRITICAL_SECTION_MASK); + assert(cs != NULL && (cs->_cs_base._cs_mutex == mutex || cs->_cs_mutex2 == mutex)); + } + else { + PyCriticalSection *cs = (PyCriticalSection *)(tstate->critical_section & ~_Py_CRITICAL_SECTION_MASK); + assert(cs != NULL && cs->_cs_mutex == mutex); + } + +#endif +} + +#endif /* Py_GIL_DISABLED */ + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_CRITICAL_SECTION_H */ diff --git a/Include/internal/pycore_crossinterp.h b/Include/internal/pycore_crossinterp.h new file mode 100644 index 0000000000000000000000000000000000000000..2dd165eae74850f9b5d2d3f7796978476c27e51f --- /dev/null +++ b/Include/internal/pycore_crossinterp.h @@ -0,0 +1,340 @@ +#ifndef Py_INTERNAL_CROSSINTERP_H +#define Py_INTERNAL_CROSSINTERP_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_lock.h" // PyMutex +#include "pycore_pyerrors.h" + +/**************/ +/* exceptions */ +/**************/ + +PyAPI_DATA(PyObject *) PyExc_InterpreterError; +PyAPI_DATA(PyObject *) PyExc_InterpreterNotFoundError; + + +/***************************/ +/* cross-interpreter calls */ +/***************************/ + +typedef int 
(*_Py_simple_func)(void *); +extern int _Py_CallInInterpreter( + PyInterpreterState *interp, + _Py_simple_func func, + void *arg); +extern int _Py_CallInInterpreterAndRawFree( + PyInterpreterState *interp, + _Py_simple_func func, + void *arg); + + +/**************************/ +/* cross-interpreter data */ +/**************************/ + +typedef struct _xid _PyCrossInterpreterData; +typedef PyObject *(*xid_newobjectfunc)(_PyCrossInterpreterData *); +typedef void (*xid_freefunc)(void *); + +// _PyCrossInterpreterData is similar to Py_buffer as an effectively +// opaque struct that holds data outside the object machinery. This +// is necessary to pass safely between interpreters in the same process. +struct _xid { + // data is the cross-interpreter-safe derivation of a Python object + // (see _PyObject_GetCrossInterpreterData). It will be NULL if the + // new_object func (below) encodes the data. + void *data; + // obj is the Python object from which the data was derived. This + // is non-NULL only if the data remains bound to the object in some + // way, such that the object must be "released" (via a decref) when + // the data is released. In that case the code that sets the field, + // likely a registered "crossinterpdatafunc", is responsible for + // ensuring it owns the reference (i.e. incref). + PyObject *obj; + // interp is the ID of the owning interpreter of the original + // object. It corresponds to the active interpreter when + // _PyObject_GetCrossInterpreterData() was called. This should only + // be set by the cross-interpreter machinery. + // + // We use the ID rather than the PyInterpreterState to avoid issues + // with deleted interpreters. Note that IDs are never re-used, so + // each one will always correspond to a specific interpreter + // (whether still alive or not). + int64_t interpid; + // new_object is a function that returns a new object in the current + // interpreter given the data. The resulting object (a new + // reference) will be equivalent to the original object. This field + // is required. + xid_newobjectfunc new_object; + // free is called when the data is released. If it is NULL then + // nothing will be done to free the data. For some types this is + // okay (e.g. bytes) and for those types this field should be set + // to NULL. However, for most the data was allocated just for + // cross-interpreter use, so it must be freed when + // _PyCrossInterpreterData_Release is called or the memory will + // leak. In that case, at the very least this field should be set + // to PyMem_RawFree (the default if not explicitly set to NULL). + // The call will happen with the original interpreter activated. + xid_freefunc free; +}; + +PyAPI_FUNC(_PyCrossInterpreterData *) _PyCrossInterpreterData_New(void); +PyAPI_FUNC(void) _PyCrossInterpreterData_Free(_PyCrossInterpreterData *data); + +#define _PyCrossInterpreterData_DATA(DATA) ((DATA)->data) +#define _PyCrossInterpreterData_OBJ(DATA) ((DATA)->obj) +#define _PyCrossInterpreterData_INTERPID(DATA) ((DATA)->interpid) +// Users should not need getters for "new_object" or "free". 
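+
+// A hedged sketch of how the data flow works (hypothetical helper; the
+// functions used here are declared in the sections that follow):
+//
+//     static PyObject *
+//     clone_into_current_interp(PyObject *obj)
+//     {
+//         // In the source interpreter:
+//         _PyCrossInterpreterData data;
+//         if (_PyObject_GetCrossInterpreterData(obj, &data) < 0) {
+//             return NULL;  // not shareable
+//         }
+//         // ...switch to the target interpreter...
+//         PyObject *copy = _PyCrossInterpreterData_NewObject(&data);
+//         (void)_PyCrossInterpreterData_Release(&data);
+//         return copy;
+//     }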
+ + +/* defining cross-interpreter data */ + +PyAPI_FUNC(void) _PyCrossInterpreterData_Init( + _PyCrossInterpreterData *data, + PyInterpreterState *interp, void *shared, PyObject *obj, + xid_newobjectfunc new_object); +PyAPI_FUNC(int) _PyCrossInterpreterData_InitWithSize( + _PyCrossInterpreterData *, + PyInterpreterState *interp, const size_t, PyObject *, + xid_newobjectfunc); +PyAPI_FUNC(void) _PyCrossInterpreterData_Clear( + PyInterpreterState *, _PyCrossInterpreterData *); + +// Normally the Init* functions are sufficient. The only time +// additional initialization might be needed is to set the "free" func, +// though that should be infrequent. +#define _PyCrossInterpreterData_SET_FREE(DATA, FUNC) \ + do { \ + (DATA)->free = (FUNC); \ + } while (0) +// Additionally, some shareable types are essentially light wrappers +// around other shareable types. The crossinterpdatafunc of the wrapper +// can often be implemented by calling the wrapped object's +// crossinterpdatafunc and then changing the "new_object" function. +// We have _PyCrossInterpreterData_SET_NEW_OBJECT() here for that, +// but might be better to have a function like +// _PyCrossInterpreterData_AdaptToWrapper() instead. +#define _PyCrossInterpreterData_SET_NEW_OBJECT(DATA, FUNC) \ + do { \ + (DATA)->new_object = (FUNC); \ + } while (0) + + +/* using cross-interpreter data */ + +PyAPI_FUNC(int) _PyObject_CheckCrossInterpreterData(PyObject *); +PyAPI_FUNC(int) _PyObject_GetCrossInterpreterData(PyObject *, _PyCrossInterpreterData *); +PyAPI_FUNC(PyObject *) _PyCrossInterpreterData_NewObject(_PyCrossInterpreterData *); +PyAPI_FUNC(int) _PyCrossInterpreterData_Release(_PyCrossInterpreterData *); +PyAPI_FUNC(int) _PyCrossInterpreterData_ReleaseAndRawFree(_PyCrossInterpreterData *); + + +/* cross-interpreter data registry */ + +// For now we use a global registry of shareable classes. An +// alternative would be to add a tp_* slot for a class's +// crossinterpdatafunc. It would be simpler and more efficient. + +typedef int (*crossinterpdatafunc)(PyThreadState *tstate, PyObject *, + _PyCrossInterpreterData *); + +struct _xidregitem; + +struct _xidregitem { + struct _xidregitem *prev; + struct _xidregitem *next; + /* This can be a dangling pointer, but only if weakref is set. */ + PyTypeObject *cls; + /* This is NULL for builtin types. */ + PyObject *weakref; + size_t refcount; + crossinterpdatafunc getdata; +}; + +struct _xidregistry { + int global; /* builtin types or heap types */ + int initialized; + PyMutex mutex; + struct _xidregitem *head; +}; + +PyAPI_FUNC(int) _PyCrossInterpreterData_RegisterClass(PyTypeObject *, crossinterpdatafunc); +PyAPI_FUNC(int) _PyCrossInterpreterData_UnregisterClass(PyTypeObject *); +PyAPI_FUNC(crossinterpdatafunc) _PyCrossInterpreterData_Lookup(PyObject *); + + +/*****************************/ +/* runtime state & lifecycle */ +/*****************************/ + +struct _xi_runtime_state { + // builtin types + // XXX Remove this field once we have a tp_* slot. + struct _xidregistry registry; +}; + +struct _xi_state { + // heap types + // XXX Remove this field once we have a tp_* slot. 
+ struct _xidregistry registry; + + // heap types + PyObject *PyExc_NotShareableError; +}; + +extern PyStatus _PyXI_Init(PyInterpreterState *interp); +extern void _PyXI_Fini(PyInterpreterState *interp); + +extern PyStatus _PyXI_InitTypes(PyInterpreterState *interp); +extern void _PyXI_FiniTypes(PyInterpreterState *interp); + +#define _PyInterpreterState_GetXIState(interp) (&(interp)->xi) + + +/***************************/ +/* short-term data sharing */ +/***************************/ + +// Ultimately we'd like to preserve enough information about the +// exception and traceback that we could re-constitute (or at least +// simulate, a la traceback.TracebackException), and even chain, a copy +// of the exception in the calling interpreter. + +typedef struct _excinfo { + struct _excinfo_type { + PyTypeObject *builtin; + const char *name; + const char *qualname; + const char *module; + } type; + const char *msg; + const char *errdisplay; +} _PyXI_excinfo; + +PyAPI_FUNC(int) _PyXI_InitExcInfo(_PyXI_excinfo *info, PyObject *exc); +PyAPI_FUNC(PyObject *) _PyXI_FormatExcInfo(_PyXI_excinfo *info); +PyAPI_FUNC(PyObject *) _PyXI_ExcInfoAsObject(_PyXI_excinfo *info); +PyAPI_FUNC(void) _PyXI_ClearExcInfo(_PyXI_excinfo *info); + + +typedef enum error_code { + _PyXI_ERR_NO_ERROR = 0, + _PyXI_ERR_UNCAUGHT_EXCEPTION = -1, + _PyXI_ERR_OTHER = -2, + _PyXI_ERR_NO_MEMORY = -3, + _PyXI_ERR_ALREADY_RUNNING = -4, + _PyXI_ERR_MAIN_NS_FAILURE = -5, + _PyXI_ERR_APPLY_NS_FAILURE = -6, + _PyXI_ERR_NOT_SHAREABLE = -7, +} _PyXI_errcode; + + +typedef struct _sharedexception { + // The originating interpreter. + PyInterpreterState *interp; + // The kind of error to propagate. + _PyXI_errcode code; + // The exception information to propagate, if applicable. + // This is populated only for some error codes, + // but always for _PyXI_ERR_UNCAUGHT_EXCEPTION. + _PyXI_excinfo uncaught; +} _PyXI_error; + +PyAPI_FUNC(PyObject *) _PyXI_ApplyError(_PyXI_error *err); + + +typedef struct xi_session _PyXI_session; +typedef struct _sharedns _PyXI_namespace; + +PyAPI_FUNC(void) _PyXI_FreeNamespace(_PyXI_namespace *ns); +PyAPI_FUNC(_PyXI_namespace *) _PyXI_NamespaceFromNames(PyObject *names); +PyAPI_FUNC(int) _PyXI_FillNamespaceFromDict( + _PyXI_namespace *ns, + PyObject *nsobj, + _PyXI_session *session); +PyAPI_FUNC(int) _PyXI_ApplyNamespace( + _PyXI_namespace *ns, + PyObject *nsobj, + PyObject *dflt); + + +// A cross-interpreter session involves entering an interpreter +// (_PyXI_Enter()), doing some work with it, and finally exiting +// that interpreter (_PyXI_Exit()). +// +// At the boundaries of the session, both entering and exiting, +// data may be exchanged between the previous interpreter and the +// target one in a thread-safe way that does not violate the +// isolation between interpreters. This includes setting objects +// in the target's __main__ module on the way in, and capturing +// uncaught exceptions on the way out. +struct xi_session { + // Once a session has been entered, this is the tstate that was + // current before the session. If it is different from cur_tstate + // then we must have switched interpreters. Either way, this will + // be the current tstate once we exit the session. + PyThreadState *prev_tstate; + // Once a session has been entered, this is the current tstate. + // It must be current when the session exits. + PyThreadState *init_tstate; + // This is true if init_tstate needs cleanup during exit. 
+    int own_init_tstate;
+
+    // This is true if, while entering the session, init_thread took
+    // "ownership" of the interpreter's __main__ module. This means
+    // it is the only thread that is allowed to run code there.
+    // (Caveat: for now, users may still run exec() against the
+    // __main__ module's dict, though that isn't advisable.)
+    int running;
+    // This is a cached reference to the __dict__ of the entered
+    // interpreter's __main__ module. It is looked up once at the
+    // beginning of the session as a convenience.
+    PyObject *main_ns;
+
+    // This is set if the interpreter is entered and raised an exception
+    // that needs to be handled in some special way during exit.
+    _PyXI_errcode *error_override;
+    // This is set if exit captured an exception to propagate.
+    _PyXI_error *error;
+
+    // -- pre-allocated memory --
+    _PyXI_error _error;
+    _PyXI_errcode _error_override;
+};
+
+PyAPI_FUNC(int) _PyXI_Enter(
+    _PyXI_session *session,
+    PyInterpreterState *interp,
+    PyObject *nsupdates);
+PyAPI_FUNC(void) _PyXI_Exit(_PyXI_session *session);
+
+PyAPI_FUNC(PyObject *) _PyXI_ApplyCapturedException(_PyXI_session *session);
+PyAPI_FUNC(int) _PyXI_HasCapturedException(_PyXI_session *session);
+
+
+/*************/
+/* other API */
+/*************/
+
+// Export for _testinternalcapi shared extension
+PyAPI_FUNC(PyInterpreterState *) _PyXI_NewInterpreter(
+    PyInterpreterConfig *config,
+    long *maybe_whence,
+    PyThreadState **p_tstate,
+    PyThreadState **p_save_tstate);
+PyAPI_FUNC(void) _PyXI_EndInterpreter(
+    PyInterpreterState *interp,
+    PyThreadState *tstate,
+    PyThreadState **p_save_tstate);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_CROSSINTERP_H */
diff --git a/Include/internal/pycore_descrobject.h b/Include/internal/pycore_descrobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..3cec59a68a3d2b2af5320fe1435c13a4aa66886c
--- /dev/null
+++ b/Include/internal/pycore_descrobject.h
@@ -0,0 +1,28 @@
+#ifndef Py_INTERNAL_DESCROBJECT_H
+#define Py_INTERNAL_DESCROBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+typedef struct {
+    PyObject_HEAD
+    PyObject *prop_get;
+    PyObject *prop_set;
+    PyObject *prop_del;
+    PyObject *prop_doc;
+    PyObject *prop_name;
+    int getter_doc;
+} propertyobject;
+
+typedef propertyobject _PyPropertyObject;
+
+extern PyTypeObject _PyMethodWrapper_Type;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_DESCROBJECT_H */
diff --git a/Include/internal/pycore_dict.h b/Include/internal/pycore_dict.h
new file mode 100644
index 0000000000000000000000000000000000000000..36da498db2c3e147d0b26fa390f57de2e88f9f16
--- /dev/null
+++ b/Include/internal/pycore_dict.h
@@ -0,0 +1,340 @@
+#ifndef Py_INTERNAL_DICT_H
+#define Py_INTERNAL_DICT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_freelist.h"   // _PyFreeListState
+#include "pycore_identifier.h" // _Py_Identifier
+#include "pycore_object.h"     // PyManagedDictPointer
+#include "pycore_pyatomic_ft_wrappers.h" // FT_ATOMIC_LOAD_SSIZE_ACQUIRE
+
+// Unsafe flavor of PyDict_GetItemWithError(): no error checking
+extern PyObject* _PyDict_GetItemWithError(PyObject *dp, PyObject *key);
+
+// Delete an item from a dict if a predicate is true
+// Returns -1 on error, 1 if the item was deleted, 0 otherwise
+// Export for '_asyncio' shared extension
+PyAPI_FUNC(int) _PyDict_DelItemIf(PyObject *mp, PyObject *key,
+ int (*predicate)(PyObject *value, void *arg), + void *arg); + +// "KnownHash" variants +// Export for '_asyncio' shared extension +PyAPI_FUNC(int) _PyDict_SetItem_KnownHash(PyObject *mp, PyObject *key, + PyObject *item, Py_hash_t hash); +// Export for '_asyncio' shared extension +PyAPI_FUNC(int) _PyDict_DelItem_KnownHash(PyObject *mp, PyObject *key, + Py_hash_t hash); +extern int _PyDict_Contains_KnownHash(PyObject *, PyObject *, Py_hash_t); + +// "Id" variants +extern PyObject* _PyDict_GetItemIdWithError(PyObject *dp, + _Py_Identifier *key); +extern int _PyDict_ContainsId(PyObject *, _Py_Identifier *); +extern int _PyDict_SetItemId(PyObject *dp, _Py_Identifier *key, PyObject *item); +extern int _PyDict_DelItemId(PyObject *mp, _Py_Identifier *key); + +extern int _PyDict_Next( + PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value, Py_hash_t *hash); + +extern int _PyDict_HasOnlyStringKeys(PyObject *mp); + +extern void _PyDict_MaybeUntrack(PyObject *mp); + +// Export for '_ctypes' shared extension +PyAPI_FUNC(Py_ssize_t) _PyDict_SizeOf(PyDictObject *); + +#define _PyDict_HasSplitTable(d) ((d)->ma_values != NULL) + +/* Like PyDict_Merge, but override can be 0, 1 or 2. If override is 0, + the first occurrence of a key wins, if override is 1, the last occurrence + of a key wins, if override is 2, a KeyError with conflicting key as + argument is raised. +*/ +PyAPI_FUNC(int) _PyDict_MergeEx(PyObject *mp, PyObject *other, int override); + +extern void _PyDict_DebugMallocStats(FILE *out); + + +/* _PyDictView */ + +typedef struct { + PyObject_HEAD + PyDictObject *dv_dict; +} _PyDictViewObject; + +extern PyObject* _PyDictView_New(PyObject *, PyTypeObject *); +extern PyObject* _PyDictView_Intersect(PyObject* self, PyObject *other); + +/* other API */ + +typedef struct { + /* Cached hash code of me_key. */ + Py_hash_t me_hash; + PyObject *me_key; + PyObject *me_value; /* This field is only meaningful for combined tables */ +} PyDictKeyEntry; + +typedef struct { + PyObject *me_key; /* The key must be Unicode and have hash. */ + PyObject *me_value; /* This field is only meaningful for combined tables */ +} PyDictUnicodeEntry; + +extern PyDictKeysObject *_PyDict_NewKeysForClass(void); +extern PyObject *_PyDict_FromKeys(PyObject *, PyObject *, PyObject *); + +/* Gets a version number unique to the current state of the keys of dict, if possible. + * Returns the version number, or zero if it was not possible to get a version number. */ +extern uint32_t _PyDictKeys_GetVersionForCurrentState( + PyInterpreterState *interp, PyDictKeysObject *dictkeys); + +extern size_t _PyDict_KeysSize(PyDictKeysObject *keys); + +extern void _PyDictKeys_DecRef(PyDictKeysObject *keys); + +/* _Py_dict_lookup() returns index of entry which can be used like DK_ENTRIES(dk)[index]. + * -1 when no entry found, -3 when compare raises error. 
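+ *
+ * A hedged usage sketch (hypothetical caller; `mp`, `key` and `hash` are
+ * assumed to be in scope):
+ *
+ *     PyObject *value;
+ *     Py_ssize_t ix = _Py_dict_lookup(mp, key, hash, &value);
+ *     if (ix >= 0) {
+ *         ... `value` now holds the found value ...
+ *     }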
+ */ +extern Py_ssize_t _Py_dict_lookup(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **value_addr); +extern Py_ssize_t _Py_dict_lookup_threadsafe(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **value_addr); + +extern Py_ssize_t _PyDict_LookupIndex(PyDictObject *, PyObject *); +extern Py_ssize_t _PyDictKeys_StringLookup(PyDictKeysObject* dictkeys, PyObject *key); +PyAPI_FUNC(PyObject *)_PyDict_LoadGlobal(PyDictObject *, PyDictObject *, PyObject *); + +/* Consumes references to key and value */ +PyAPI_FUNC(int) _PyDict_SetItem_Take2(PyDictObject *op, PyObject *key, PyObject *value); +extern int _PyDict_SetItem_LockHeld(PyDictObject *dict, PyObject *name, PyObject *value); +// Export for '_asyncio' shared extension +PyAPI_FUNC(int) _PyDict_SetItem_KnownHash_LockHeld(PyDictObject *mp, PyObject *key, + PyObject *value, Py_hash_t hash); +// Export for '_asyncio' shared extension +PyAPI_FUNC(int) _PyDict_GetItemRef_KnownHash_LockHeld(PyDictObject *op, PyObject *key, Py_hash_t hash, PyObject **result); +extern int _PyDict_GetItemRef_KnownHash(PyDictObject *op, PyObject *key, Py_hash_t hash, PyObject **result); +extern int _PyDict_GetItemRef_Unicode_LockHeld(PyDictObject *op, PyObject *key, PyObject **result); +extern int _PyObjectDict_SetItem(PyTypeObject *tp, PyObject *obj, PyObject **dictptr, PyObject *name, PyObject *value); + +extern int _PyDict_Pop_KnownHash( + PyDictObject *dict, + PyObject *key, + Py_hash_t hash, + PyObject **result); + +#define DKIX_EMPTY (-1) +#define DKIX_DUMMY (-2) /* Used internally */ +#define DKIX_ERROR (-3) +#define DKIX_KEY_CHANGED (-4) /* Used internally */ + +typedef enum { + DICT_KEYS_GENERAL = 0, + DICT_KEYS_UNICODE = 1, + DICT_KEYS_SPLIT = 2 +} DictKeysKind; + +/* See dictobject.c for actual layout of DictKeysObject */ +struct _dictkeysobject { + Py_ssize_t dk_refcnt; + + /* Size of the hash table (dk_indices). It must be a power of 2. */ + uint8_t dk_log2_size; + + /* Size of the hash table (dk_indices) by bytes. */ + uint8_t dk_log2_index_bytes; + + /* Kind of keys */ + uint8_t dk_kind; + +#ifdef Py_GIL_DISABLED + /* Lock used to protect shared keys */ + PyMutex dk_mutex; +#endif + + /* Version number -- Reset to 0 by any modification to keys */ + uint32_t dk_version; + + /* Number of usable entries in dk_entries. */ + Py_ssize_t dk_usable; + + /* Number of used entries in dk_entries. */ + Py_ssize_t dk_nentries; + + + /* Actual hash table of dk_size entries. It holds indices in dk_entries, + or DKIX_EMPTY(-1) or DKIX_DUMMY(-2). + + Indices must be: 0 <= indice < USABLE_FRACTION(dk_size). + + The size in bytes of an indice depends on dk_size: + + - 1 byte if dk_size <= 0xff (char*) + - 2 bytes if dk_size <= 0xffff (int16_t*) + - 4 bytes if dk_size <= 0xffffffff (int32_t*) + - 8 bytes otherwise (int64_t*) + + Dynamically sized, SIZEOF_VOID_P is minimum. */ + char dk_indices[]; /* char is required to avoid strict aliasing. */ + + /* "PyDictKeyEntry or PyDictUnicodeEntry dk_entries[USABLE_FRACTION(DK_SIZE(dk))];" array follows: + see the DK_ENTRIES() / DK_UNICODE_ENTRIES() functions below */ +}; + +/* This must be no more than 250, for the prefix size to fit in one byte. */ +#define SHARED_KEYS_MAX_SIZE 30 +#define NEXT_LOG2_SHARED_KEYS_MAX_SIZE 6 + +/* Layout of dict values: + * + * The PyObject *values are preceded by an array of bytes holding + * the insertion order and size. + * [-1] = prefix size. [-2] = used size. size[-2-n...] = insertion order. 
+ */
+struct _dictvalues {
+    uint8_t capacity;
+    uint8_t size;
+    uint8_t embedded;
+    uint8_t valid;
+    PyObject *values[1];
+};
+
+#define DK_LOG_SIZE(dk)  _Py_RVALUE((dk)->dk_log2_size)
+#if SIZEOF_VOID_P > 4
+#define DK_SIZE(dk)      (((int64_t)1)<<DK_LOG_SIZE(dk))
+#else
+#define DK_SIZE(dk)      (1<<DK_LOG_SIZE(dk))
+#endif
+
+static inline void* _DK_ENTRIES(PyDictKeysObject *dk) {
+    int8_t *indices = (int8_t*)(dk->dk_indices);
+    size_t index = (size_t)1 << dk->dk_log2_index_bytes;
+    return (&indices[index]);
+}
+
+static inline PyDictKeyEntry* DK_ENTRIES(PyDictKeysObject *dk) {
+    assert(dk->dk_kind == DICT_KEYS_GENERAL);
+    return (PyDictKeyEntry*)_DK_ENTRIES(dk);
+}
+static inline PyDictUnicodeEntry* DK_UNICODE_ENTRIES(PyDictKeysObject *dk) {
+    assert(dk->dk_kind != DICT_KEYS_GENERAL);
+    return (PyDictUnicodeEntry*)_DK_ENTRIES(dk);
+}
+
+#define DK_IS_UNICODE(dk) ((dk)->dk_kind != DICT_KEYS_GENERAL)
+
+#define DICT_VERSION_INCREMENT (1 << (DICT_MAX_WATCHERS + DICT_WATCHED_MUTATION_BITS))
+#define DICT_WATCHER_MASK ((1 << DICT_MAX_WATCHERS) - 1)
+#define DICT_WATCHER_AND_MODIFICATION_MASK ((1 << (DICT_MAX_WATCHERS + DICT_WATCHED_MUTATION_BITS)) - 1)
+
+#ifdef Py_GIL_DISABLED
+
+#define THREAD_LOCAL_DICT_VERSION_COUNT 256
+#define THREAD_LOCAL_DICT_VERSION_BATCH (THREAD_LOCAL_DICT_VERSION_COUNT * DICT_VERSION_INCREMENT)
+
+static inline uint64_t
+dict_next_version(PyInterpreterState *interp)
+{
+    PyThreadState *tstate = PyThreadState_GET();
+    uint64_t cur_progress = (tstate->dict_global_version &
+                             (THREAD_LOCAL_DICT_VERSION_BATCH - 1));
+    if (cur_progress == 0) {
+        uint64_t next = _Py_atomic_add_uint64(&interp->dict_state.global_version,
+                                              THREAD_LOCAL_DICT_VERSION_BATCH);
+        tstate->dict_global_version = next;
+    }
+    return tstate->dict_global_version += DICT_VERSION_INCREMENT;
+}
+
+#define DICT_NEXT_VERSION(INTERP) dict_next_version(INTERP)
+
+#else
+#define DICT_NEXT_VERSION(INTERP) \
+    ((INTERP)->dict_state.global_version += DICT_VERSION_INCREMENT)
+#endif
+
+void
+_PyDict_SendEvent(int watcher_bits,
+                  PyDict_WatchEvent event,
+                  PyDictObject *mp,
+                  PyObject *key,
+                  PyObject *value);
+
+static inline uint64_t
+_PyDict_NotifyEvent(PyInterpreterState *interp,
+                    PyDict_WatchEvent event,
+                    PyDictObject *mp,
+                    PyObject *key,
+                    PyObject *value)
+{
+    assert(Py_REFCNT((PyObject*)mp) > 0);
+    int watcher_bits = mp->ma_version_tag & DICT_WATCHER_MASK;
+    if (watcher_bits) {
+        RARE_EVENT_STAT_INC(watched_dict_modification);
+        _PyDict_SendEvent(watcher_bits, event, mp, key, value);
+    }
+    return DICT_NEXT_VERSION(interp) | (mp->ma_version_tag & DICT_WATCHER_AND_MODIFICATION_MASK);
+}
+
+extern PyDictObject *_PyObject_MaterializeManagedDict(PyObject *obj);
+
+PyAPI_FUNC(PyObject *)_PyDict_FromItems(
+    PyObject *const *keys, Py_ssize_t keys_offset,
+    PyObject *const *values, Py_ssize_t values_offset,
+    Py_ssize_t length);
+
+static inline uint8_t *
+get_insertion_order_array(PyDictValues *values)
+{
+    return (uint8_t *)&values->values[values->capacity];
+}
+
+static inline void
+_PyDictValues_AddToInsertionOrder(PyDictValues *values, Py_ssize_t ix)
+{
+    assert(ix < SHARED_KEYS_MAX_SIZE);
+    int size = values->size;
+    uint8_t *array = get_insertion_order_array(values);
+    assert(size < values->capacity);
+    assert(((uint8_t)ix) == ix);
+    array[size] = (uint8_t)ix;
+    values->size = size+1;
+}
+
+static inline size_t
+shared_keys_usable_size(PyDictKeysObject *keys)
+{
+    // dk_usable will decrease for each instance that is created and each
+    // value that is added. dk_nentries will increase for each value that
+    // is added. We want to always return the right value or larger.
+ // We therefore increase dk_nentries first and we decrease dk_usable + // second, and conversely here we read dk_usable first and dk_entries + // second (to avoid the case where we read entries before the increment + // and read usable after the decrement) + Py_ssize_t dk_usable = FT_ATOMIC_LOAD_SSIZE_ACQUIRE(keys->dk_usable); + Py_ssize_t dk_nentries = FT_ATOMIC_LOAD_SSIZE_ACQUIRE(keys->dk_nentries); + return dk_nentries + dk_usable; +} + +static inline size_t +_PyInlineValuesSize(PyTypeObject *tp) +{ + PyDictKeysObject *keys = ((PyHeapTypeObject*)tp)->ht_cached_keys; + assert(keys != NULL); + size_t size = shared_keys_usable_size(keys); + size_t prefix_size = _Py_SIZE_ROUND_UP(size, sizeof(PyObject *)); + assert(prefix_size < 256); + return prefix_size + (size + 1) * sizeof(PyObject *); +} + +int +_PyDict_DetachFromObject(PyDictObject *dict, PyObject *obj); + +PyDictObject *_PyObject_MaterializeManagedDict_LockHeld(PyObject *); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_DICT_H */ diff --git a/Include/internal/pycore_dict_state.h b/Include/internal/pycore_dict_state.h new file mode 100644 index 0000000000000000000000000000000000000000..1a44755c7a01a3a34f908220ef165173ecf18570 --- /dev/null +++ b/Include/internal/pycore_dict_state.h @@ -0,0 +1,32 @@ +#ifndef Py_INTERNAL_DICT_STATE_H +#define Py_INTERNAL_DICT_STATE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#define DICT_MAX_WATCHERS 8 +#define DICT_WATCHED_MUTATION_BITS 4 + +struct _Py_dict_state { + /*Global counter used to set ma_version_tag field of dictionary. + * It is incremented each time that a dictionary is created and each + * time that a dictionary is modified. */ + uint64_t global_version; + uint32_t next_keys_version; + PyDict_WatchCallback watchers[DICT_MAX_WATCHERS]; +}; + +#define _dict_state_INIT \ + { \ + .next_keys_version = 2, \ + } + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_DICT_STATE_H */ diff --git a/Include/internal/pycore_dtoa.h b/Include/internal/pycore_dtoa.h new file mode 100644 index 0000000000000000000000000000000000000000..e4222c5267d6bec25a2e0c954f731963f2b9ac1b --- /dev/null +++ b/Include/internal/pycore_dtoa.h @@ -0,0 +1,75 @@ +#ifndef Py_INTERNAL_DTOA_H +#define Py_INTERNAL_DTOA_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_pymath.h" // _PY_SHORT_FLOAT_REPR + + +typedef uint32_t ULong; + +struct +Bigint { + struct Bigint *next; + int k, maxwds, sign, wds; + ULong x[1]; +}; + +#if defined(Py_USING_MEMORY_DEBUGGER) || _PY_SHORT_FLOAT_REPR == 0 + +struct _dtoa_state { + int _not_used; +}; +#define _dtoa_state_INIT(INTERP) \ + {0} + +#else // !Py_USING_MEMORY_DEBUGGER && _PY_SHORT_FLOAT_REPR != 0 + +/* The size of the Bigint freelist */ +#define Bigint_Kmax 7 + +/* The size of the cached powers of 5 array */ +#define Bigint_Pow5size 8 + +#ifndef PRIVATE_MEM +#define PRIVATE_MEM 2304 +#endif +#define Bigint_PREALLOC_SIZE \ + ((PRIVATE_MEM+sizeof(double)-1)/sizeof(double)) + +struct _dtoa_state { + // p5s is an array of powers of 5 of the form: + // 5**(2**(i+2)) for 0 <= i < Bigint_Pow5size + struct Bigint *p5s[Bigint_Pow5size]; + // XXX This should be freed during runtime fini. 
+    struct Bigint *freelist[Bigint_Kmax+1];
+    double preallocated[Bigint_PREALLOC_SIZE];
+    double *preallocated_next;
+};
+#define _dtoa_state_INIT(INTERP) \
+    { \
+        .preallocated_next = (INTERP)->dtoa.preallocated, \
+    }
+
+#endif  // !Py_USING_MEMORY_DEBUGGER
+
+
+extern double _Py_dg_strtod(const char *str, char **ptr);
+extern char* _Py_dg_dtoa(double d, int mode, int ndigits,
+                         int *decpt, int *sign, char **rve);
+extern void _Py_dg_freedtoa(char *s);
+
+
+extern PyStatus _PyDtoa_Init(PyInterpreterState *interp);
+extern void _PyDtoa_Fini(PyInterpreterState *interp);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_DTOA_H */
diff --git a/Include/internal/pycore_emscripten_signal.h b/Include/internal/pycore_emscripten_signal.h
new file mode 100644
index 0000000000000000000000000000000000000000..754193e21dec5a5608200d4302bbfc6c6b1e7eb1
--- /dev/null
+++ b/Include/internal/pycore_emscripten_signal.h
@@ -0,0 +1,30 @@
+#ifndef Py_EMSCRIPTEN_SIGNAL_H
+#define Py_EMSCRIPTEN_SIGNAL_H
+
+#if defined(__EMSCRIPTEN__)
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+void
+_Py_CheckEmscriptenSignals(void);
+
+void
+_Py_CheckEmscriptenSignalsPeriodically(void);
+
+#define _Py_CHECK_EMSCRIPTEN_SIGNALS() _Py_CheckEmscriptenSignals()
+
+#define _Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY() _Py_CheckEmscriptenSignalsPeriodically()
+
+extern int Py_EMSCRIPTEN_SIGNAL_HANDLING;
+extern int _Py_emscripten_signal_clock;
+
+#else
+
+#define _Py_CHECK_EMSCRIPTEN_SIGNALS()
+#define _Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY()
+
+#endif // defined(__EMSCRIPTEN__)
+
+#endif // ndef Py_EMSCRIPTEN_SIGNAL_H
diff --git a/Include/internal/pycore_emscripten_trampoline.h b/Include/internal/pycore_emscripten_trampoline.h
new file mode 100644
index 0000000000000000000000000000000000000000..e519c99ad86ccefcca28aebe3368c079d61fcce1
--- /dev/null
+++ b/Include/internal/pycore_emscripten_trampoline.h
@@ -0,0 +1,81 @@
+#ifndef Py_EMSCRIPTEN_TRAMPOLINE_H
+#define Py_EMSCRIPTEN_TRAMPOLINE_H
+
+#include "pycore_runtime.h"  // _PyRuntimeState
+
+/**
+ * C function call trampolines to mitigate bad function pointer casts.
+ *
+ * Section 6.3.2.3, paragraph 8 of the C standard reads:
+ *
+ *     A pointer to a function of one type may be converted to a pointer to a
+ *     function of another type and back again; the result shall compare equal to
+ *     the original pointer. If a converted pointer is used to call a function
+ *     whose type is not compatible with the pointed-to type, the behavior is
+ *     undefined.
+ *
+ * Typical native ABIs ignore additional arguments or fill in missing values
+ * with 0/NULL in a function pointer cast. Compilers do not warn when a
+ * function pointer is explicitly cast to an incompatible type.
+ *
+ * Bad fpcasts are an issue in WebAssembly. WASM's indirect_call has strict
+ * function signature checks. Argument count, types, and return type must match.
+ *
+ * Third party code unintentionally relies on problematic fpcasts. The call
+ * trampoline mitigates common occurrences of bad fpcasts on Emscripten.
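+ *
+ * For example (illustrative, not from the original header): a METH_NOARGS
+ * method is defined with two parameters -- PyObject *(*)(PyObject *, PyObject *)
+ * -- but the call sites below invoke it through PyCFunctionWithKeywords,
+ * which takes three. Native ABIs quietly ignore the extra argument, while
+ * WASM's indirect_call traps on the signature mismatch, so the call is
+ * routed through one of the JavaScript trampolines declared below.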
+ */
+
+#if defined(__EMSCRIPTEN__) && defined(PY_CALL_TRAMPOLINE)
+
+void _Py_EmscriptenTrampoline_Init(_PyRuntimeState *runtime);
+
+PyObject*
+_PyEM_TrampolineCall_JavaScript(PyCFunctionWithKeywords func,
+                                PyObject* self,
+                                PyObject* args,
+                                PyObject* kw);
+
+PyObject*
+_PyEM_TrampolineCall_Reflection(PyCFunctionWithKeywords func,
+                                PyObject* self,
+                                PyObject* args,
+                                PyObject* kw);
+
+#define _PyEM_TrampolineCall(meth, self, args, kw) \
+    ((_PyRuntime.wasm_type_reflection_available) ? \
+        (_PyEM_TrampolineCall_Reflection((PyCFunctionWithKeywords)(meth), (self), (args), (kw))) : \
+        (_PyEM_TrampolineCall_JavaScript((PyCFunctionWithKeywords)(meth), (self), (args), (kw))))
+
+#define _PyCFunction_TrampolineCall(meth, self, args) \
+    _PyEM_TrampolineCall( \
+        (*(PyCFunctionWithKeywords)(void(*)(void))(meth)), (self), (args), NULL)
+
+#define _PyCFunctionWithKeywords_TrampolineCall(meth, self, args, kw) \
+    _PyEM_TrampolineCall((meth), (self), (args), (kw))
+
+#define descr_set_trampoline_call(set, obj, value, closure) \
+    ((int)_PyEM_TrampolineCall((PyCFunctionWithKeywords)(set), (obj), (value), (PyObject*)(closure)))
+
+#define descr_get_trampoline_call(get, obj, closure) \
+    _PyEM_TrampolineCall((PyCFunctionWithKeywords)(get), (obj), (PyObject*)(closure), NULL)
+
+
+#else // defined(__EMSCRIPTEN__) && defined(PY_CALL_TRAMPOLINE)
+
+#define _Py_EmscriptenTrampoline_Init(runtime)
+
+#define _PyCFunction_TrampolineCall(meth, self, args) \
+    (meth)((self), (args))
+
+#define _PyCFunctionWithKeywords_TrampolineCall(meth, self, args, kw) \
+    (meth)((self), (args), (kw))
+
+#define descr_set_trampoline_call(set, obj, value, closure) \
+    (set)((obj), (value), (closure))
+
+#define descr_get_trampoline_call(get, obj, closure) \
+    (get)((obj), (closure))
+
+#endif // defined(__EMSCRIPTEN__) && defined(PY_CALL_TRAMPOLINE)
+
+#endif // ndef Py_EMSCRIPTEN_TRAMPOLINE_H
diff --git a/Include/internal/pycore_exceptions.h b/Include/internal/pycore_exceptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..26456d1966bbb0eb6a7becedabe8054d49d8dac7
--- /dev/null
+++ b/Include/internal/pycore_exceptions.h
@@ -0,0 +1,40 @@
+#ifndef Py_INTERNAL_EXCEPTIONS_H
+#define Py_INTERNAL_EXCEPTIONS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+/* runtime lifecycle */
+
+extern PyStatus _PyExc_InitState(PyInterpreterState *);
+extern PyStatus _PyExc_InitGlobalObjects(PyInterpreterState *);
+extern int _PyExc_InitTypes(PyInterpreterState *);
+extern void _PyExc_Fini(PyInterpreterState *);
+
+
+/* other API */
+
+struct _Py_exc_state {
+    // The dict mapping from errno codes to OSError subclasses
+    PyObject *errnomap;
+    PyBaseExceptionObject *memerrors_freelist;
+    int memerrors_numfree;
+#ifdef Py_GIL_DISABLED
+    PyMutex memerrors_lock;
+#endif
+    // The ExceptionGroup type
+    PyObject *PyExc_ExceptionGroup;
+};
+
+extern void _PyExc_ClearExceptionGroupType(PyInterpreterState *);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_EXCEPTIONS_H */
diff --git a/Include/internal/pycore_faulthandler.h b/Include/internal/pycore_faulthandler.h
new file mode 100644
index 0000000000000000000000000000000000000000..6dd7d8d7ca9792e200d43c02867fb0bd59f12c53
--- /dev/null
+++ b/Include/internal/pycore_faulthandler.h
@@ -0,0 +1,99 @@
+#ifndef Py_INTERNAL_FAULTHANDLER_H
+#define Py_INTERNAL_FAULTHANDLER_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#ifdef HAVE_SIGACTION
+# include <signal.h>              // sigaction
+#endif
+
+
+#ifndef MS_WINDOWS
+   /* register() is useless on Windows, because only SIGSEGV, SIGABRT and
+      SIGILL can be handled by the process, and these signals can only be used
+      with enable(), not using register() */
+# define FAULTHANDLER_USER
+#endif
+
+
+#ifdef HAVE_SIGACTION
+/* Using an alternative stack requires sigaltstack()
+   and sigaction() SA_ONSTACK */
+# ifdef HAVE_SIGALTSTACK
+#  define FAULTHANDLER_USE_ALT_STACK
+# endif
+typedef struct sigaction _Py_sighandler_t;
+#else
+typedef PyOS_sighandler_t _Py_sighandler_t;
+#endif  // HAVE_SIGACTION
+
+
+#ifdef FAULTHANDLER_USER
+struct faulthandler_user_signal {
+    int enabled;
+    PyObject *file;
+    int fd;
+    int all_threads;
+    int chain;
+    _Py_sighandler_t previous;
+    PyInterpreterState *interp;
+};
+#endif /* FAULTHANDLER_USER */
+
+
+struct _faulthandler_runtime_state {
+    struct {
+        int enabled;
+        PyObject *file;
+        int fd;
+        int all_threads;
+        PyInterpreterState *interp;
+#ifdef MS_WINDOWS
+        void *exc_handler;
+#endif
+    } fatal_error;
+
+    struct {
+        PyObject *file;
+        int fd;
+        PY_TIMEOUT_T timeout_us;   /* timeout in microseconds */
+        int repeat;
+        PyInterpreterState *interp;
+        int exit;
+        char *header;
+        size_t header_len;
+        /* The main thread always holds this lock. It is only released when
+           faulthandler_thread() is interrupted before this thread exits, or at
+           Python exit. */
+        PyThread_type_lock cancel_event;
+        /* released by child thread when joined */
+        PyThread_type_lock running;
+    } thread;
+
+#ifdef FAULTHANDLER_USER
+    struct faulthandler_user_signal *user_signals;
+#endif
+
+#ifdef FAULTHANDLER_USE_ALT_STACK
+    stack_t stack;
+    stack_t old_stack;
+#endif
+};
+
+#define _faulthandler_runtime_state_INIT \
+    { \
+        .fatal_error = { \
+            .fd = -1, \
+        }, \
+    }
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_FAULTHANDLER_H */
diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h
new file mode 100644
index 0000000000000000000000000000000000000000..13f86b01bbfe8ff80069f16b47ad89d2ffc28f7d
--- /dev/null
+++ b/Include/internal/pycore_fileutils.h
@@ -0,0 +1,335 @@
+#ifndef Py_INTERNAL_FILEUTILS_H
+#define Py_INTERNAL_FILEUTILS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include <locale.h>               // struct lconv
+
+
+/* A routine to check if a file descriptor can be select()-ed.
+ */
+#ifdef _MSC_VER
+    /* On Windows, any socket fd can be select()-ed, no matter how high */
+#   define _PyIsSelectable_fd(FD) (1)
+#else
+#   define _PyIsSelectable_fd(FD) ((unsigned int)(FD) < (unsigned int)FD_SETSIZE)
+#endif
+
+struct _fileutils_state {
+    int force_ascii;
+};
+
+typedef enum {
+    _Py_ERROR_UNKNOWN=0,
+    _Py_ERROR_STRICT,
+    _Py_ERROR_SURROGATEESCAPE,
+    _Py_ERROR_REPLACE,
+    _Py_ERROR_IGNORE,
+    _Py_ERROR_BACKSLASHREPLACE,
+    _Py_ERROR_SURROGATEPASS,
+    _Py_ERROR_XMLCHARREFREPLACE,
+    _Py_ERROR_OTHER
+} _Py_error_handler;
+
+// Export for '_testinternalcapi' shared extension
+PyAPI_FUNC(_Py_error_handler) _Py_GetErrorHandler(const char *errors);
+
+// Export for '_testinternalcapi' shared extension
+PyAPI_FUNC(int) _Py_DecodeLocaleEx(
+    const char *arg,
+    wchar_t **wstr,
+    size_t *wlen,
+    const char **reason,
+    int current_locale,
+    _Py_error_handler errors);
+
+// Export for '_testinternalcapi' shared extension
+PyAPI_FUNC(int) _Py_EncodeLocaleEx(
+    const wchar_t *text,
+    char **str,
+    size_t *error_pos,
+    const char **reason,
+    int current_locale,
+    _Py_error_handler errors);
+
+extern char* _Py_EncodeLocaleRaw(
+    const wchar_t *text,
+    size_t *error_pos);
+
+extern PyObject* _Py_device_encoding(int);
+
+#if defined(MS_WINDOWS) || defined(__APPLE__)
+    /* On Windows, the count parameter of read() is an int (bpo-9015, bpo-9611).
+       On macOS 10.13, read() and write() with more than INT_MAX bytes
+       fail with EINVAL (bpo-24658). */
+#   define _PY_READ_MAX  INT_MAX
+#   define _PY_WRITE_MAX INT_MAX
+#else
+    /* write() should truncate the input to PY_SSIZE_T_MAX bytes,
+       but it's safer to do it ourselves to have portable behaviour */
+#   define _PY_READ_MAX  PY_SSIZE_T_MAX
+#   define _PY_WRITE_MAX PY_SSIZE_T_MAX
+#endif
+
+#ifdef MS_WINDOWS
+struct _Py_stat_struct {
+    uint64_t st_dev;
+    uint64_t st_ino;
+    unsigned short st_mode;
+    int st_nlink;
+    int st_uid;
+    int st_gid;
+    unsigned long st_rdev;
+    __int64 st_size;
+    time_t st_atime;
+    int st_atime_nsec;
+    time_t st_mtime;
+    int st_mtime_nsec;
+    time_t st_ctime;
+    int st_ctime_nsec;
+    time_t st_birthtime;
+    int st_birthtime_nsec;
+    unsigned long st_file_attributes;
+    unsigned long st_reparse_tag;
+    uint64_t st_ino_high;
+};
+#else
+#  define _Py_stat_struct stat
+#endif
+
+// Export for 'mmap' shared extension
+PyAPI_FUNC(int) _Py_fstat(
+    int fd,
+    struct _Py_stat_struct *status);
+
+// Export for 'mmap' shared extension
+PyAPI_FUNC(int) _Py_fstat_noraise(
+    int fd,
+    struct _Py_stat_struct *status);
+
+// Export for '_tkinter' shared extension
+PyAPI_FUNC(int) _Py_stat(
+    PyObject *path,
+    struct stat *status);
+
+// Export for 'select' shared extension (Solaris newDevPollObject())
+PyAPI_FUNC(int) _Py_open(
+    const char *pathname,
+    int flags);
+
+// Export for '_posixsubprocess' shared extension
+PyAPI_FUNC(int) _Py_open_noraise(
+    const char *pathname,
+    int flags);
+
+extern FILE* _Py_wfopen(
+    const wchar_t *path,
+    const wchar_t *mode);
+
+extern Py_ssize_t _Py_read(
+    int fd,
+    void *buf,
+    size_t count);
+
+// Export for 'select' shared extension (Solaris devpoll_flush())
+PyAPI_FUNC(Py_ssize_t) _Py_write(
+    int fd,
+    const void *buf,
+    size_t count);
+
+// Export for '_posixsubprocess' shared extension
+PyAPI_FUNC(Py_ssize_t) _Py_write_noraise(
+    int fd,
+    const void *buf,
+    size_t count);
+
+#ifdef HAVE_READLINK
+extern int _Py_wreadlink(
+    const wchar_t *path,
+    wchar_t *buf,
+    /* Number of characters of 'buf' buffer
+       including the trailing NUL character */
+    size_t buflen);
+#endif
+
+#ifdef HAVE_REALPATH
+extern wchar_t*
_Py_wrealpath( + const wchar_t *path, + wchar_t *resolved_path, + /* Number of characters of 'resolved_path' buffer + including the trailing NUL character */ + size_t resolved_path_len); +#endif + +extern wchar_t* _Py_wgetcwd( + wchar_t *buf, + /* Number of characters of 'buf' buffer + including the trailing NUL character */ + size_t buflen); + +extern int _Py_get_inheritable(int fd); + +// Export for '_socket' shared extension +PyAPI_FUNC(int) _Py_set_inheritable(int fd, int inheritable, + int *atomic_flag_works); + +// Export for '_posixsubprocess' shared extension +PyAPI_FUNC(int) _Py_set_inheritable_async_safe(int fd, int inheritable, + int *atomic_flag_works); + +// Export for '_socket' shared extension +PyAPI_FUNC(int) _Py_dup(int fd); + +extern int _Py_get_blocking(int fd); + +extern int _Py_set_blocking(int fd, int blocking); + +#ifdef MS_WINDOWS +extern void* _Py_get_osfhandle_noraise(int fd); + +// Export for '_testconsole' shared extension +PyAPI_FUNC(void*) _Py_get_osfhandle(int fd); + +extern int _Py_open_osfhandle_noraise(void *handle, int flags); + +extern int _Py_open_osfhandle(void *handle, int flags); +#endif /* MS_WINDOWS */ + +// This is used after getting NULL back from Py_DecodeLocale(). +#define DECODE_LOCALE_ERR(NAME, LEN) \ + ((LEN) == (size_t)-2) \ + ? _PyStatus_ERR("cannot decode " NAME) \ + : _PyStatus_NO_MEMORY() + +extern int _Py_HasFileSystemDefaultEncodeErrors; + +extern int _Py_DecodeUTF8Ex( + const char *arg, + Py_ssize_t arglen, + wchar_t **wstr, + size_t *wlen, + const char **reason, + _Py_error_handler errors); + +extern int _Py_EncodeUTF8Ex( + const wchar_t *text, + char **str, + size_t *error_pos, + const char **reason, + int raw_malloc, + _Py_error_handler errors); + +extern wchar_t* _Py_DecodeUTF8_surrogateescape( + const char *arg, + Py_ssize_t arglen, + size_t *wlen); + +extern int +_Py_wstat(const wchar_t *, struct stat *); + +extern int _Py_GetForceASCII(void); + +/* Reset "force ASCII" mode (if it was initialized). + + This function should be called when Python changes the LC_CTYPE locale, + so the "force ASCII" mode can be detected again on the new locale + encoding. 
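+
+   A caller that switches the locale would pair the two calls roughly as
+   in this sketch (assuming <locale.h> is included):
+
+       setlocale(LC_CTYPE, "");   // adopt the user's LC_CTYPE locale
+       _Py_ResetForceASCII();     // force re-detection on the new locale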
*/ +extern void _Py_ResetForceASCII(void); + + +extern int _Py_GetLocaleconvNumeric( + struct lconv *lc, + PyObject **decimal_point, + PyObject **thousands_sep); + +// Export for '_posixsubprocess' (on macOS) +PyAPI_FUNC(void) _Py_closerange(int first, int last); + +extern wchar_t* _Py_GetLocaleEncoding(void); +extern PyObject* _Py_GetLocaleEncodingObject(void); + +#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION +extern int _Py_LocaleUsesNonUnicodeWchar(void); + +extern wchar_t* _Py_DecodeNonUnicodeWchar( + const wchar_t* native, + Py_ssize_t size); + +extern int _Py_EncodeNonUnicodeWchar_InPlace( + wchar_t* unicode, + Py_ssize_t size); +#endif + +extern int _Py_isabs(const wchar_t *path); +extern int _Py_abspath(const wchar_t *path, wchar_t **abspath_p); +#ifdef MS_WINDOWS +extern int _PyOS_getfullpathname(const wchar_t *path, wchar_t **abspath_p); +#endif +extern wchar_t* _Py_join_relfile(const wchar_t *dirname, + const wchar_t *relfile); +extern int _Py_add_relfile(wchar_t *dirname, + const wchar_t *relfile, + size_t bufsize); +extern size_t _Py_find_basename(const wchar_t *filename); + +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(wchar_t*) _Py_normpath(wchar_t *path, Py_ssize_t size); + +extern wchar_t *_Py_normpath_and_size(wchar_t *path, Py_ssize_t size, Py_ssize_t *length); + +// The Windows Games API family does not provide these functions +// so provide our own implementations. Remove them in case they get added +// to the Games API family +#if defined(MS_WINDOWS_GAMES) && !defined(MS_WINDOWS_DESKTOP) +#include // HRESULT + +extern HRESULT PathCchSkipRoot(const wchar_t *pszPath, const wchar_t **ppszRootEnd); +#endif /* defined(MS_WINDOWS_GAMES) && !defined(MS_WINDOWS_DESKTOP) */ + +extern void _Py_skiproot(const wchar_t *path, Py_ssize_t size, Py_ssize_t *drvsize, Py_ssize_t *rootsize); + +// Macros to protect CRT calls against instant termination when passed an +// invalid parameter (bpo-23524). IPH stands for Invalid Parameter Handler. +// Usage: +// +// _Py_BEGIN_SUPPRESS_IPH +// ... 
+// _Py_END_SUPPRESS_IPH +#if defined _MSC_VER && _MSC_VER >= 1900 + +# include // _set_thread_local_invalid_parameter_handler() + + extern _invalid_parameter_handler _Py_silent_invalid_parameter_handler; +# define _Py_BEGIN_SUPPRESS_IPH \ + { _invalid_parameter_handler _Py_old_handler = \ + _set_thread_local_invalid_parameter_handler(_Py_silent_invalid_parameter_handler); +# define _Py_END_SUPPRESS_IPH \ + _set_thread_local_invalid_parameter_handler(_Py_old_handler); } +#else +# define _Py_BEGIN_SUPPRESS_IPH +# define _Py_END_SUPPRESS_IPH +#endif /* _MSC_VER >= 1900 */ + +// Export for 'select' shared extension (Argument Clinic code) +PyAPI_FUNC(int) _PyLong_FileDescriptor_Converter(PyObject *, void *); + +// Export for test_peg_generator +PyAPI_FUNC(char*) _Py_UniversalNewlineFgetsWithSize(char *, int, FILE*, PyObject *, size_t*); + +extern int _PyFile_Flush(PyObject *); + +#ifndef MS_WINDOWS +extern int _Py_GetTicksPerSecond(long *ticks_per_second); +#endif + +// Export for '_testcapi' shared extension +PyAPI_FUNC(int) _Py_IsValidFD(int fd); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_FILEUTILS_H */ diff --git a/Include/internal/pycore_fileutils_windows.h b/Include/internal/pycore_fileutils_windows.h new file mode 100644 index 0000000000000000000000000000000000000000..b79aa9fb4653765cb7f171ade2b1570b5c451e9d --- /dev/null +++ b/Include/internal/pycore_fileutils_windows.h @@ -0,0 +1,98 @@ +#ifndef Py_INTERNAL_FILEUTILS_WINDOWS_H +#define Py_INTERNAL_FILEUTILS_WINDOWS_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#ifdef MS_WINDOWS + +#if !defined(NTDDI_WIN10_NI) || !(NTDDI_VERSION >= NTDDI_WIN10_NI) +typedef struct _FILE_STAT_BASIC_INFORMATION { + LARGE_INTEGER FileId; + LARGE_INTEGER CreationTime; + LARGE_INTEGER LastAccessTime; + LARGE_INTEGER LastWriteTime; + LARGE_INTEGER ChangeTime; + LARGE_INTEGER AllocationSize; + LARGE_INTEGER EndOfFile; + ULONG FileAttributes; + ULONG ReparseTag; + ULONG NumberOfLinks; + ULONG DeviceType; + ULONG DeviceCharacteristics; + ULONG Reserved; + LARGE_INTEGER VolumeSerialNumber; + FILE_ID_128 FileId128; +} FILE_STAT_BASIC_INFORMATION; + +typedef enum _FILE_INFO_BY_NAME_CLASS { + FileStatByNameInfo, + FileStatLxByNameInfo, + FileCaseSensitiveByNameInfo, + FileStatBasicByNameInfo, + MaximumFileInfoByNameClass +} FILE_INFO_BY_NAME_CLASS; +#endif + +typedef BOOL (WINAPI *PGetFileInformationByName)( + PCWSTR FileName, + FILE_INFO_BY_NAME_CLASS FileInformationClass, + PVOID FileInfoBuffer, + ULONG FileInfoBufferSize +); + +static inline BOOL _Py_GetFileInformationByName( + PCWSTR FileName, + FILE_INFO_BY_NAME_CLASS FileInformationClass, + PVOID FileInfoBuffer, + ULONG FileInfoBufferSize +) { + static PGetFileInformationByName GetFileInformationByName = NULL; + static int GetFileInformationByName_init = -1; + + if (GetFileInformationByName_init < 0) { + HMODULE hMod = LoadLibraryW(L"api-ms-win-core-file-l2-1-4"); + GetFileInformationByName_init = 0; + if (hMod) { + GetFileInformationByName = (PGetFileInformationByName)GetProcAddress( + hMod, "GetFileInformationByName"); + if (GetFileInformationByName) { + GetFileInformationByName_init = 1; + } else { + FreeLibrary(hMod); + } + } + } + + if (GetFileInformationByName_init <= 0) { + SetLastError(ERROR_NOT_SUPPORTED); + return FALSE; + } + return GetFileInformationByName(FileName, FileInformationClass, FileInfoBuffer, FileInfoBufferSize); +} + +static inline BOOL _Py_GetFileInformationByName_ErrorIsTrustworthy(int 
error) +{ + switch(error) { + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + case ERROR_NOT_READY: + case ERROR_BAD_NET_NAME: + case ERROR_BAD_NETPATH: + case ERROR_BAD_PATHNAME: + case ERROR_INVALID_NAME: + case ERROR_FILENAME_EXCED_RANGE: + return TRUE; + case ERROR_NOT_SUPPORTED: + return FALSE; + } + return FALSE; +} + +#endif + +#endif diff --git a/Include/internal/pycore_floatobject.h b/Include/internal/pycore_floatobject.h new file mode 100644 index 0000000000000000000000000000000000000000..f984df695696c3e6ffdf75a39fa11376499d3cca --- /dev/null +++ b/Include/internal/pycore_floatobject.h @@ -0,0 +1,62 @@ +#ifndef Py_INTERNAL_FLOATOBJECT_H +#define Py_INTERNAL_FLOATOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_freelist.h" // _PyFreeListState +#include "pycore_unicodeobject.h" // _PyUnicodeWriter + +/* runtime lifecycle */ + +extern void _PyFloat_InitState(PyInterpreterState *); +extern PyStatus _PyFloat_InitTypes(PyInterpreterState *); +extern void _PyFloat_FiniType(PyInterpreterState *); + + +/* other API */ + +enum _py_float_format_type { + _py_float_format_unknown, + _py_float_format_ieee_big_endian, + _py_float_format_ieee_little_endian, +}; + +struct _Py_float_runtime_state { + enum _py_float_format_type float_format; + enum _py_float_format_type double_format; +}; + + + + +PyAPI_FUNC(void) _PyFloat_ExactDealloc(PyObject *op); + + +extern void _PyFloat_DebugMallocStats(FILE* out); + + +/* Format the object based on the format_spec, as defined in PEP 3101 + (Advanced String Formatting). */ +extern int _PyFloat_FormatAdvancedWriter( + _PyUnicodeWriter *writer, + PyObject *obj, + PyObject *format_spec, + Py_ssize_t start, + Py_ssize_t end); + +extern PyObject* _Py_string_to_number_with_underscores( + const char *str, Py_ssize_t len, const char *what, PyObject *obj, void *arg, + PyObject *(*innerfunc)(const char *, Py_ssize_t, void *)); + +extern double _Py_parse_inf_or_nan(const char *p, char **endptr); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_FLOATOBJECT_H */ diff --git a/Include/internal/pycore_flowgraph.h b/Include/internal/pycore_flowgraph.h new file mode 100644 index 0000000000000000000000000000000000000000..819117b83114bcd5cf7d6d64fb04f5a9e07b6b40 --- /dev/null +++ b/Include/internal/pycore_flowgraph.h @@ -0,0 +1,40 @@ +#ifndef Py_INTERNAL_CFG_H +#define Py_INTERNAL_CFG_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_compile.h" +#include "pycore_instruction_sequence.h" +#include "pycore_opcode_utils.h" + +struct _PyCfgBuilder; + +int _PyCfgBuilder_UseLabel(struct _PyCfgBuilder *g, _PyJumpTargetLabel lbl); +int _PyCfgBuilder_Addop(struct _PyCfgBuilder *g, int opcode, int oparg, _Py_SourceLocation loc); + +struct _PyCfgBuilder* _PyCfgBuilder_New(void); +void _PyCfgBuilder_Free(struct _PyCfgBuilder *g); +int _PyCfgBuilder_CheckSize(struct _PyCfgBuilder* g); + +int _PyCfg_OptimizeCodeUnit(struct _PyCfgBuilder *g, PyObject *consts, PyObject *const_cache, + int nlocals, int nparams, int firstlineno); + +int _PyCfg_ToInstructionSequence(struct _PyCfgBuilder *g, _PyInstructionSequence *seq); +int _PyCfg_OptimizedCfgToInstructionSequence(struct _PyCfgBuilder *g, _PyCompile_CodeUnitMetadata *umd, + int code_flags, int *stackdepth, int *nlocalsplus, + _PyInstructionSequence *seq); + +PyCodeObject * 
+_PyAssemble_MakeCodeObject(_PyCompile_CodeUnitMetadata *u, PyObject *const_cache, + PyObject *consts, int maxdepth, _PyInstructionSequence *instrs, + int nlocalsplus, int code_flags, PyObject *filename); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_CFG_H */ diff --git a/Include/internal/pycore_format.h b/Include/internal/pycore_format.h new file mode 100644 index 0000000000000000000000000000000000000000..1b8d57539ca505fbc56ecb2245785a8b37b85c9f --- /dev/null +++ b/Include/internal/pycore_format.h @@ -0,0 +1,27 @@ +#ifndef Py_INTERNAL_FORMAT_H +#define Py_INTERNAL_FORMAT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +/* Format codes + * F_LJUST '-' + * F_SIGN '+' + * F_BLANK ' ' + * F_ALT '#' + * F_ZERO '0' + */ +#define F_LJUST (1<<0) +#define F_SIGN (1<<1) +#define F_BLANK (1<<2) +#define F_ALT (1<<3) +#define F_ZERO (1<<4) + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_FORMAT_H */ diff --git a/Include/internal/pycore_frame.h b/Include/internal/pycore_frame.h new file mode 100644 index 0000000000000000000000000000000000000000..af181e3760d231c626df1ec36b966c276483dc7d --- /dev/null +++ b/Include/internal/pycore_frame.h @@ -0,0 +1,330 @@ +#ifndef Py_INTERNAL_FRAME_H +#define Py_INTERNAL_FRAME_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include +#include // offsetof() +#include "pycore_code.h" // STATS + +/* See Objects/frame_layout.md for an explanation of the frame stack + * including explanation of the PyFrameObject and _PyInterpreterFrame + * structs. */ + + +struct _frame { + PyObject_HEAD + PyFrameObject *f_back; /* previous frame, or NULL */ + struct _PyInterpreterFrame *f_frame; /* points to the frame data */ + PyObject *f_trace; /* Trace function */ + int f_lineno; /* Current line number. Only valid if non-zero */ + char f_trace_lines; /* Emit per-line trace events? */ + char f_trace_opcodes; /* Emit per-opcode trace events? */ + PyObject *f_extra_locals; /* Dict for locals set by users using f_locals, could be NULL */ + /* This is purely for backwards compatibility for PyEval_GetLocals. + PyEval_GetLocals requires a borrowed reference so the actual reference + is stored here */ + PyObject *f_locals_cache; + /* The frame data, if this frame object owns the frame */ + PyObject *_f_frame_data[1]; +}; + +extern PyFrameObject* _PyFrame_New_NoTrack(PyCodeObject *code); + + +/* other API */ + +typedef enum _framestate { + FRAME_CREATED = -3, + FRAME_SUSPENDED = -2, + FRAME_SUSPENDED_YIELD_FROM = -1, + FRAME_EXECUTING = 0, + FRAME_COMPLETED = 1, + FRAME_CLEARED = 4 +} PyFrameState; + +#define FRAME_STATE_SUSPENDED(S) ((S) == FRAME_SUSPENDED || (S) == FRAME_SUSPENDED_YIELD_FROM) +#define FRAME_STATE_FINISHED(S) ((S) >= FRAME_COMPLETED) + +enum _frameowner { + FRAME_OWNED_BY_THREAD = 0, + FRAME_OWNED_BY_GENERATOR = 1, + FRAME_OWNED_BY_FRAME_OBJECT = 2, + FRAME_OWNED_BY_CSTACK = 3, +}; + +typedef struct _PyInterpreterFrame { + PyObject *f_executable; /* Strong reference (code object or None) */ + struct _PyInterpreterFrame *previous; + PyObject *f_funcobj; /* Strong reference. Only valid if not on C stack */ + PyObject *f_globals; /* Borrowed reference. Only valid if not on C stack */ + PyObject *f_builtins; /* Borrowed reference. Only valid if not on C stack */ + PyObject *f_locals; /* Strong reference, may be NULL. 
Only valid if not on C stack */ + PyFrameObject *frame_obj; /* Strong reference, may be NULL. Only valid if not on C stack */ + _Py_CODEUNIT *instr_ptr; /* Instruction currently executing (or about to begin) */ + int stacktop; /* Offset of TOS from localsplus */ + uint16_t return_offset; /* Only relevant during a function call */ + char owner; + /* Locals and stack */ + PyObject *localsplus[1]; +} _PyInterpreterFrame; + +#define _PyInterpreterFrame_LASTI(IF) \ + ((int)((IF)->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(IF)))) + +static inline PyCodeObject *_PyFrame_GetCode(_PyInterpreterFrame *f) { + assert(PyCode_Check(f->f_executable)); + return (PyCodeObject *)f->f_executable; +} + +static inline PyObject **_PyFrame_Stackbase(_PyInterpreterFrame *f) { + return f->localsplus + _PyFrame_GetCode(f)->co_nlocalsplus; +} + +static inline PyObject *_PyFrame_StackPeek(_PyInterpreterFrame *f) { + assert(f->stacktop > _PyFrame_GetCode(f)->co_nlocalsplus); + assert(f->localsplus[f->stacktop-1] != NULL); + return f->localsplus[f->stacktop-1]; +} + +static inline PyObject *_PyFrame_StackPop(_PyInterpreterFrame *f) { + assert(f->stacktop > _PyFrame_GetCode(f)->co_nlocalsplus); + f->stacktop--; + return f->localsplus[f->stacktop]; +} + +static inline void _PyFrame_StackPush(_PyInterpreterFrame *f, PyObject *value) { + f->localsplus[f->stacktop] = value; + f->stacktop++; +} + +#define FRAME_SPECIALS_SIZE ((int)((sizeof(_PyInterpreterFrame)-1)/sizeof(PyObject *))) + +static inline int +_PyFrame_NumSlotsForCodeObject(PyCodeObject *code) +{ + /* This function needs to remain in sync with the calculation of + * co_framesize in Tools/build/deepfreeze.py */ + assert(code->co_framesize >= FRAME_SPECIALS_SIZE); + return code->co_framesize - FRAME_SPECIALS_SIZE; +} + +static inline void _PyFrame_Copy(_PyInterpreterFrame *src, _PyInterpreterFrame *dest) +{ + assert(src->stacktop >= _PyFrame_GetCode(src)->co_nlocalsplus); + *dest = *src; + for (int i = 1; i < src->stacktop; i++) { + dest->localsplus[i] = src->localsplus[i]; + } + // Don't leave a dangling pointer to the old frame when creating generators + // and coroutines: + dest->previous = NULL; +} + +/* Consumes reference to func and locals. + Does not initialize frame->previous, which happens + when frame is linked into the frame stack. + */ +static inline void +_PyFrame_Initialize( + _PyInterpreterFrame *frame, PyFunctionObject *func, + PyObject *locals, PyCodeObject *code, int null_locals_from) +{ + frame->f_funcobj = (PyObject *)func; + frame->f_executable = Py_NewRef(code); + frame->f_builtins = func->func_builtins; + frame->f_globals = func->func_globals; + frame->f_locals = locals; + frame->stacktop = code->co_nlocalsplus; + frame->frame_obj = NULL; + frame->instr_ptr = _PyCode_CODE(code); + frame->return_offset = 0; + frame->owner = FRAME_OWNED_BY_THREAD; + + for (int i = null_locals_from; i < code->co_nlocalsplus; i++) { + frame->localsplus[i] = NULL; + } +} + +/* Gets the pointer to the locals array + * that precedes this frame. + */ +static inline PyObject** +_PyFrame_GetLocalsArray(_PyInterpreterFrame *frame) +{ + return frame->localsplus; +} + +/* Fetches the stack pointer, and sets stacktop to -1. + Having stacktop <= 0 ensures that invalid + values are not visible to the cycle GC. + We choose -1 rather than 0 to assist debugging. 
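+
+   A sketch of the intended pairing (not a real call site):
+
+       PyObject **stack_pointer = _PyFrame_GetStackPointer(frame);
+       // ... push and pop values through stack_pointer ...
+       _PyFrame_SetStackPointer(frame, stack_pointer);  // restore stacktop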
*/ +static inline PyObject** +_PyFrame_GetStackPointer(_PyInterpreterFrame *frame) +{ + PyObject **sp = frame->localsplus + frame->stacktop; + frame->stacktop = -1; + return sp; +} + +static inline void +_PyFrame_SetStackPointer(_PyInterpreterFrame *frame, PyObject **stack_pointer) +{ + frame->stacktop = (int)(stack_pointer - frame->localsplus); +} + +/* Determine whether a frame is incomplete. + * A frame is incomplete if it is part way through + * creating cell objects or a generator or coroutine. + * + * Frames on the frame stack are incomplete until the + * first RESUME instruction. + * Frames owned by a generator are always complete. + */ +static inline bool +_PyFrame_IsIncomplete(_PyInterpreterFrame *frame) +{ + if (frame->owner == FRAME_OWNED_BY_CSTACK) { + return true; + } + return frame->owner != FRAME_OWNED_BY_GENERATOR && + frame->instr_ptr < _PyCode_CODE(_PyFrame_GetCode(frame)) + _PyFrame_GetCode(frame)->_co_firsttraceable; +} + +static inline _PyInterpreterFrame * +_PyFrame_GetFirstComplete(_PyInterpreterFrame *frame) +{ + while (frame && _PyFrame_IsIncomplete(frame)) { + frame = frame->previous; + } + return frame; +} + +static inline _PyInterpreterFrame * +_PyThreadState_GetFrame(PyThreadState *tstate) +{ + return _PyFrame_GetFirstComplete(tstate->current_frame); +} + +/* For use by _PyFrame_GetFrameObject + Do not call directly. */ +PyFrameObject * +_PyFrame_MakeAndSetFrameObject(_PyInterpreterFrame *frame); + +/* Gets the PyFrameObject for this frame, lazily + * creating it if necessary. + * Returns a borrowed reference */ +static inline PyFrameObject * +_PyFrame_GetFrameObject(_PyInterpreterFrame *frame) +{ + + assert(!_PyFrame_IsIncomplete(frame)); + PyFrameObject *res = frame->frame_obj; + if (res != NULL) { + return res; + } + return _PyFrame_MakeAndSetFrameObject(frame); +} + +void +_PyFrame_ClearLocals(_PyInterpreterFrame *frame); + +/* Clears all references in the frame. + * If take is non-zero, then the _PyInterpreterFrame frame + * may be transferred to the frame object it references + * instead of being cleared. Either way + * the caller no longer owns the references + * in the frame. + * take should be set to 1 for heap allocated + * frames like the ones in generators and coroutines. + */ +void +_PyFrame_ClearExceptCode(_PyInterpreterFrame * frame); + +int +_PyFrame_Traverse(_PyInterpreterFrame *frame, visitproc visit, void *arg); + +bool +_PyFrame_HasHiddenLocals(_PyInterpreterFrame *frame); + +PyObject * +_PyFrame_GetLocals(_PyInterpreterFrame *frame); + +static inline bool +_PyThreadState_HasStackSpace(PyThreadState *tstate, int size) +{ + assert( + (tstate->datastack_top == NULL && tstate->datastack_limit == NULL) + || + (tstate->datastack_top != NULL && tstate->datastack_limit != NULL) + ); + return tstate->datastack_top != NULL && + size < tstate->datastack_limit - tstate->datastack_top; +} + +extern _PyInterpreterFrame * +_PyThreadState_PushFrame(PyThreadState *tstate, size_t size); + +PyAPI_FUNC(void) _PyThreadState_PopFrame(PyThreadState *tstate, _PyInterpreterFrame *frame); + +/* Pushes a frame without checking for space. + * Must be guarded by _PyThreadState_HasStackSpace() + * Consumes reference to func. 
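+ *
+ * A sketch of the expected call pattern (illustrative):
+ *
+ *     PyCodeObject *code = (PyCodeObject *)func->func_code;
+ *     if (_PyThreadState_HasStackSpace(tstate, code->co_framesize)) {
+ *         frame = _PyFrame_PushUnchecked(tstate, func, 0);
+ *     }
+ *     else {
+ *         // fall back to a slow path that can grow the data stack
+ *     }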
*/ +static inline _PyInterpreterFrame * +_PyFrame_PushUnchecked(PyThreadState *tstate, PyFunctionObject *func, int null_locals_from) +{ + CALL_STAT_INC(frames_pushed); + PyCodeObject *code = (PyCodeObject *)func->func_code; + _PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->datastack_top; + tstate->datastack_top += code->co_framesize; + assert(tstate->datastack_top < tstate->datastack_limit); + _PyFrame_Initialize(new_frame, func, NULL, code, null_locals_from); + return new_frame; +} + +/* Pushes a trampoline frame without checking for space. + * Must be guarded by _PyThreadState_HasStackSpace() */ +static inline _PyInterpreterFrame * +_PyFrame_PushTrampolineUnchecked(PyThreadState *tstate, PyCodeObject *code, int stackdepth) +{ + CALL_STAT_INC(frames_pushed); + _PyInterpreterFrame *frame = (_PyInterpreterFrame *)tstate->datastack_top; + tstate->datastack_top += code->co_framesize; + assert(tstate->datastack_top < tstate->datastack_limit); + frame->f_funcobj = Py_None; + frame->f_executable = Py_NewRef(code); +#ifdef Py_DEBUG + frame->f_builtins = NULL; + frame->f_globals = NULL; +#endif + frame->f_locals = NULL; + frame->stacktop = code->co_nlocalsplus + stackdepth; + frame->frame_obj = NULL; + frame->instr_ptr = _PyCode_CODE(code); + frame->owner = FRAME_OWNED_BY_THREAD; + frame->return_offset = 0; + return frame; +} + +static inline +PyGenObject *_PyFrame_GetGenerator(_PyInterpreterFrame *frame) +{ + assert(frame->owner == FRAME_OWNED_BY_GENERATOR); + size_t offset_in_gen = offsetof(PyGenObject, gi_iframe); + return (PyGenObject *)(((char *)frame) - offset_in_gen); +} + +PyAPI_FUNC(_PyInterpreterFrame *) +_PyEvalFramePushAndInit(PyThreadState *tstate, PyFunctionObject *func, + PyObject *locals, PyObject* const* args, + size_t argcount, PyObject *kwnames); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_FRAME_H */ diff --git a/Include/internal/pycore_freelist.h b/Include/internal/pycore_freelist.h new file mode 100644 index 0000000000000000000000000000000000000000..e684e084b8bef8e932addd46684083085ab15c68 --- /dev/null +++ b/Include/internal/pycore_freelist.h @@ -0,0 +1,153 @@ +#ifndef Py_INTERNAL_FREELIST_H +#define Py_INTERNAL_FREELIST_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +// PyTuple_MAXSAVESIZE - largest tuple to save on free list +// PyTuple_MAXFREELIST - maximum number of tuples of each size to save + +#ifdef WITH_FREELISTS +// with freelists +# define PyTuple_MAXSAVESIZE 20 +# define PyTuple_NFREELISTS PyTuple_MAXSAVESIZE +# define PyTuple_MAXFREELIST 2000 +# define PyList_MAXFREELIST 80 +# define PyDict_MAXFREELIST 80 +# define PyFloat_MAXFREELIST 100 +# define PyContext_MAXFREELIST 255 +# define _PyAsyncGen_MAXFREELIST 80 +# define _PyObjectStackChunk_MAXFREELIST 4 +#else +# define PyTuple_NFREELISTS 0 +# define PyTuple_MAXFREELIST 0 +# define PyList_MAXFREELIST 0 +# define PyDict_MAXFREELIST 0 +# define PyFloat_MAXFREELIST 0 +# define PyContext_MAXFREELIST 0 +# define _PyAsyncGen_MAXFREELIST 0 +# define _PyObjectStackChunk_MAXFREELIST 0 +#endif + +struct _Py_list_freelist { +#ifdef WITH_FREELISTS + PyListObject *items[PyList_MAXFREELIST]; + int numfree; +#endif +}; + +struct _Py_tuple_freelist { +#if WITH_FREELISTS + /* There is one freelist for each size from 1 to PyTuple_MAXSAVESIZE. + The empty tuple is handled separately. + + Each tuple stored in the array is the head of the linked list + (and the next available tuple) for that size. 
The actual tuple + object is used as the linked list node, with its first item + (ob_item[0]) pointing to the next node (i.e. the previous head). + Each linked list is initially NULL. */ + PyTupleObject *items[PyTuple_NFREELISTS]; + int numfree[PyTuple_NFREELISTS]; +#else + char _unused; // Empty structs are not allowed. +#endif +}; + +struct _Py_float_freelist { +#ifdef WITH_FREELISTS + /* Special free list + free_list is a singly-linked list of available PyFloatObjects, + linked via abuse of their ob_type members. */ + int numfree; + PyFloatObject *items; +#endif +}; + +struct _Py_dict_freelist { +#ifdef WITH_FREELISTS + /* Dictionary reuse scheme to save calls to malloc and free */ + PyDictObject *items[PyDict_MAXFREELIST]; + int numfree; +#endif +}; + +struct _Py_dictkeys_freelist { +#ifdef WITH_FREELISTS + /* Dictionary keys reuse scheme to save calls to malloc and free */ + PyDictKeysObject *items[PyDict_MAXFREELIST]; + int numfree; +#endif +}; + +struct _Py_slice_freelist { +#ifdef WITH_FREELISTS + /* Using a cache is very effective since typically only a single slice is + created and then deleted again. */ + PySliceObject *slice_cache; +#endif +}; + +struct _Py_context_freelist { +#ifdef WITH_FREELISTS + // List of free PyContext objects + PyContext *items; + int numfree; +#endif +}; + +struct _Py_async_gen_freelist { +#ifdef WITH_FREELISTS + /* Freelists boost performance 6-10%; they also reduce memory + fragmentation, as _PyAsyncGenWrappedValue and PyAsyncGenASend + are short-living objects that are instantiated for every + __anext__() call. */ + struct _PyAsyncGenWrappedValue* items[_PyAsyncGen_MAXFREELIST]; + int numfree; +#endif +}; + +struct _Py_async_gen_asend_freelist { +#ifdef WITH_FREELISTS + struct PyAsyncGenASend* items[_PyAsyncGen_MAXFREELIST]; + int numfree; +#endif +}; + +struct _PyObjectStackChunk; + +struct _Py_object_stack_freelist { + struct _PyObjectStackChunk *items; + Py_ssize_t numfree; +}; + +struct _Py_object_freelists { + struct _Py_float_freelist floats; + struct _Py_tuple_freelist tuples; + struct _Py_list_freelist lists; + struct _Py_dict_freelist dicts; + struct _Py_dictkeys_freelist dictkeys; + struct _Py_slice_freelist slices; + struct _Py_context_freelist contexts; + struct _Py_async_gen_freelist async_gens; + struct _Py_async_gen_asend_freelist async_gen_asends; + struct _Py_object_stack_freelist object_stacks; +}; + +extern void _PyObject_ClearFreeLists(struct _Py_object_freelists *freelists, int is_finalization); +extern void _PyTuple_ClearFreeList(struct _Py_object_freelists *freelists, int is_finalization); +extern void _PyFloat_ClearFreeList(struct _Py_object_freelists *freelists, int is_finalization); +extern void _PyList_ClearFreeList(struct _Py_object_freelists *freelists, int is_finalization); +extern void _PySlice_ClearFreeList(struct _Py_object_freelists *freelists, int is_finalization); +extern void _PyDict_ClearFreeList(struct _Py_object_freelists *freelists, int is_finalization); +extern void _PyAsyncGen_ClearFreeLists(struct _Py_object_freelists *freelists, int is_finalization); +extern void _PyContext_ClearFreeList(struct _Py_object_freelists *freelists, int is_finalization); +extern void _PyObjectStackChunk_ClearFreeList(struct _Py_object_freelists *freelists, int is_finalization); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_FREELIST_H */ diff --git a/Include/internal/pycore_function.h b/Include/internal/pycore_function.h new file mode 100644 index 
0000000000000000000000000000000000000000..6d44e933e8a8cb3ad03291f47a891f60b1938277 --- /dev/null +++ b/Include/internal/pycore_function.h @@ -0,0 +1,55 @@ +#ifndef Py_INTERNAL_FUNCTION_H +#define Py_INTERNAL_FUNCTION_H +#ifdef __cplusplus +extern "C" { +#endif + +#include "pycore_lock.h" + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +extern PyObject* _PyFunction_Vectorcall( + PyObject *func, + PyObject *const *stack, + size_t nargsf, + PyObject *kwnames); + +#define FUNC_MAX_WATCHERS 8 + +#define FUNC_VERSION_CACHE_SIZE (1<<12) /* Must be a power of 2 */ + +struct _func_version_cache_item { + PyFunctionObject *func; + PyObject *code; +}; + +struct _py_func_state { +#ifdef Py_GIL_DISABLED + // Protects next_version + PyMutex mutex; +#endif + + uint32_t next_version; + // Borrowed references to function and code objects whose + // func_version % FUNC_VERSION_CACHE_SIZE + // once was equal to the index in the table. + // They are cleared when the function or code object is deallocated. + struct _func_version_cache_item func_version_cache[FUNC_VERSION_CACHE_SIZE]; +}; + +extern PyFunctionObject* _PyFunction_FromConstructor(PyFrameConstructor *constr); + +extern uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func); +PyAPI_FUNC(void) _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version); +void _PyFunction_ClearCodeByVersion(uint32_t version); +PyFunctionObject *_PyFunction_LookupByVersion(uint32_t version, PyObject **p_code); + +extern PyObject *_Py_set_function_type_params( + PyThreadState* unused, PyObject *func, PyObject *type_params); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_FUNCTION_H */ diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h new file mode 100644 index 0000000000000000000000000000000000000000..357177bcd6fd845852f36e5b11e8dd07249b0e0b --- /dev/null +++ b/Include/internal/pycore_gc.h @@ -0,0 +1,365 @@ +#ifndef Py_INTERNAL_GC_H +#define Py_INTERNAL_GC_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_freelist.h" // _PyFreeListState + +/* GC information is stored BEFORE the object structure. */ +typedef struct { + // Pointer to next object in the list. + // 0 means the object is not tracked + uintptr_t _gc_next; + + // Pointer to previous object in the list. + // Lowest two bits are used for flags documented later. + uintptr_t _gc_prev; +} PyGC_Head; + +#define _PyGC_Head_UNUSED PyGC_Head + + +/* Get an object's GC head */ +static inline PyGC_Head* _Py_AS_GC(PyObject *op) { + char *gc = ((char*)op) - sizeof(PyGC_Head); + return (PyGC_Head*)gc; +} + +/* Get the object given the GC head */ +static inline PyObject* _Py_FROM_GC(PyGC_Head *gc) { + char *op = ((char *)gc) + sizeof(PyGC_Head); + return (PyObject *)op; +} + + +/* Bit flags for ob_gc_bits (in Py_GIL_DISABLED builds) + * + * Setting the bits requires a relaxed store. The per-object lock must also be + * held, except when the object is only visible to a single thread (e.g. during + * object initialization or destruction). + * + * Reading the bits requires using a relaxed load, but does not require holding + * the per-object lock. 
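+ *
+ * Illustrative use of the helpers defined below:
+ *
+ *     _PyObject_SET_GC_BITS(op, _PyGC_BITS_TRACKED);        // writer: lock held
+ *     if (_PyObject_HAS_GC_BITS(op, _PyGC_BITS_TRACKED)) {  // reader: no lock
+ *         ...
+ *     }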
+ */ +#ifdef Py_GIL_DISABLED +# define _PyGC_BITS_TRACKED (1) // Tracked by the GC +# define _PyGC_BITS_FINALIZED (2) // tp_finalize was called +# define _PyGC_BITS_UNREACHABLE (4) +# define _PyGC_BITS_FROZEN (8) +# define _PyGC_BITS_SHARED (16) +# define _PyGC_BITS_SHARED_INLINE (32) +# define _PyGC_BITS_DEFERRED (64) // Use deferred reference counting +#endif + +#ifdef Py_GIL_DISABLED + +static inline void +_PyObject_SET_GC_BITS(PyObject *op, uint8_t new_bits) +{ + uint8_t bits = _Py_atomic_load_uint8_relaxed(&op->ob_gc_bits); + _Py_atomic_store_uint8_relaxed(&op->ob_gc_bits, bits | new_bits); +} + +static inline int +_PyObject_HAS_GC_BITS(PyObject *op, uint8_t bits) +{ + return (_Py_atomic_load_uint8_relaxed(&op->ob_gc_bits) & bits) != 0; +} + +static inline void +_PyObject_CLEAR_GC_BITS(PyObject *op, uint8_t bits_to_clear) +{ + uint8_t bits = _Py_atomic_load_uint8_relaxed(&op->ob_gc_bits); + _Py_atomic_store_uint8_relaxed(&op->ob_gc_bits, bits & ~bits_to_clear); +} + +#endif + +/* True if the object is currently tracked by the GC. */ +static inline int _PyObject_GC_IS_TRACKED(PyObject *op) { +#ifdef Py_GIL_DISABLED + return _PyObject_HAS_GC_BITS(op, _PyGC_BITS_TRACKED); +#else + PyGC_Head *gc = _Py_AS_GC(op); + return (gc->_gc_next != 0); +#endif +} +#define _PyObject_GC_IS_TRACKED(op) _PyObject_GC_IS_TRACKED(_Py_CAST(PyObject*, op)) + +/* True if the object may be tracked by the GC in the future, or already is. + This can be useful to implement some optimizations. */ +static inline int _PyObject_GC_MAY_BE_TRACKED(PyObject *obj) { + if (!PyObject_IS_GC(obj)) { + return 0; + } + if (PyTuple_CheckExact(obj)) { + return _PyObject_GC_IS_TRACKED(obj); + } + return 1; +} + +#ifdef Py_GIL_DISABLED + +/* True if memory the object references is shared between + * multiple threads and needs special purpose when freeing + * those references due to the possibility of in-flight + * lock-free reads occurring. The object is responsible + * for calling _PyMem_FreeDelayed on the referenced + * memory. */ +static inline int _PyObject_GC_IS_SHARED(PyObject *op) { + return _PyObject_HAS_GC_BITS(op, _PyGC_BITS_SHARED); +} +#define _PyObject_GC_IS_SHARED(op) _PyObject_GC_IS_SHARED(_Py_CAST(PyObject*, op)) + +static inline void _PyObject_GC_SET_SHARED(PyObject *op) { + _PyObject_SET_GC_BITS(op, _PyGC_BITS_SHARED); +} +#define _PyObject_GC_SET_SHARED(op) _PyObject_GC_SET_SHARED(_Py_CAST(PyObject*, op)) + +/* True if the memory of the object is shared between multiple + * threads and needs special purpose when freeing due to + * the possibility of in-flight lock-free reads occurring. + * Objects with this bit that are GC objects will automatically + * delay-freed by PyObject_GC_Del. */ +static inline int _PyObject_GC_IS_SHARED_INLINE(PyObject *op) { + return _PyObject_HAS_GC_BITS(op, _PyGC_BITS_SHARED_INLINE); +} +#define _PyObject_GC_IS_SHARED_INLINE(op) \ + _PyObject_GC_IS_SHARED_INLINE(_Py_CAST(PyObject*, op)) + +static inline void _PyObject_GC_SET_SHARED_INLINE(PyObject *op) { + _PyObject_SET_GC_BITS(op, _PyGC_BITS_SHARED_INLINE); +} +#define _PyObject_GC_SET_SHARED_INLINE(op) \ + _PyObject_GC_SET_SHARED_INLINE(_Py_CAST(PyObject*, op)) + +#endif + +/* Bit flags for _gc_prev */ +/* Bit 0 is set when tp_finalize is called */ +#define _PyGC_PREV_MASK_FINALIZED (1) +/* Bit 1 is set when the object is in generation which is GCed currently. */ +#define _PyGC_PREV_MASK_COLLECTING (2) +/* The (N-2) most significant bits contain the real address. 
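+
+   Recovering the pointer therefore masks off the two flag bits, exactly
+   as _PyGCHead_PREV() below does:
+
+       PyGC_Head *prev = (PyGC_Head *)(gc->_gc_prev & _PyGC_PREV_MASK);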
*/ +#define _PyGC_PREV_SHIFT (2) +#define _PyGC_PREV_MASK (((uintptr_t) -1) << _PyGC_PREV_SHIFT) + +/* set for debugging information */ +#define _PyGC_DEBUG_STATS (1<<0) /* print collection statistics */ +#define _PyGC_DEBUG_COLLECTABLE (1<<1) /* print collectable objects */ +#define _PyGC_DEBUG_UNCOLLECTABLE (1<<2) /* print uncollectable objects */ +#define _PyGC_DEBUG_SAVEALL (1<<5) /* save all garbage in gc.garbage */ +#define _PyGC_DEBUG_LEAK _PyGC_DEBUG_COLLECTABLE | \ + _PyGC_DEBUG_UNCOLLECTABLE | \ + _PyGC_DEBUG_SAVEALL + +typedef enum { + // GC was triggered by heap allocation + _Py_GC_REASON_HEAP, + + // GC was called during shutdown + _Py_GC_REASON_SHUTDOWN, + + // GC was called by gc.collect() or PyGC_Collect() + _Py_GC_REASON_MANUAL +} _PyGC_Reason; + +// Lowest bit of _gc_next is used for flags only in GC. +// But it is always 0 for normal code. +static inline PyGC_Head* _PyGCHead_NEXT(PyGC_Head *gc) { + uintptr_t next = gc->_gc_next; + return (PyGC_Head*)next; +} +static inline void _PyGCHead_SET_NEXT(PyGC_Head *gc, PyGC_Head *next) { + gc->_gc_next = (uintptr_t)next; +} + +// Lowest two bits of _gc_prev is used for _PyGC_PREV_MASK_* flags. +static inline PyGC_Head* _PyGCHead_PREV(PyGC_Head *gc) { + uintptr_t prev = (gc->_gc_prev & _PyGC_PREV_MASK); + return (PyGC_Head*)prev; +} +static inline void _PyGCHead_SET_PREV(PyGC_Head *gc, PyGC_Head *prev) { + uintptr_t uprev = (uintptr_t)prev; + assert((uprev & ~_PyGC_PREV_MASK) == 0); + gc->_gc_prev = ((gc->_gc_prev & ~_PyGC_PREV_MASK) | uprev); +} + +static inline int _PyGC_FINALIZED(PyObject *op) { +#ifdef Py_GIL_DISABLED + return _PyObject_HAS_GC_BITS(op, _PyGC_BITS_FINALIZED); +#else + PyGC_Head *gc = _Py_AS_GC(op); + return ((gc->_gc_prev & _PyGC_PREV_MASK_FINALIZED) != 0); +#endif +} +static inline void _PyGC_SET_FINALIZED(PyObject *op) { +#ifdef Py_GIL_DISABLED + _PyObject_SET_GC_BITS(op, _PyGC_BITS_FINALIZED); +#else + PyGC_Head *gc = _Py_AS_GC(op); + gc->_gc_prev |= _PyGC_PREV_MASK_FINALIZED; +#endif +} +static inline void _PyGC_CLEAR_FINALIZED(PyObject *op) { +#ifdef Py_GIL_DISABLED + _PyObject_CLEAR_GC_BITS(op, _PyGC_BITS_FINALIZED); +#else + PyGC_Head *gc = _Py_AS_GC(op); + gc->_gc_prev &= ~_PyGC_PREV_MASK_FINALIZED; +#endif +} + + +/* GC runtime state */ + +/* If we change this, we need to change the default value in the + signature of gc.collect. */ +#define NUM_GENERATIONS 3 +/* + NOTE: about untracking of mutable objects. + + Certain types of container cannot participate in a reference cycle, and + so do not need to be tracked by the garbage collector. Untracking these + objects reduces the cost of garbage collections. However, determining + which objects may be untracked is not free, and the costs must be + weighed against the benefits for garbage collection. + + There are two possible strategies for when to untrack a container: + + i) When the container is created. + ii) When the container is examined by the garbage collector. + + Tuples containing only immutable objects (integers, strings etc, and + recursively, tuples of immutable objects) do not need to be tracked. + The interpreter creates a large number of tuples, many of which will + not survive until garbage collection. It is therefore not worthwhile + to untrack eligible tuples at creation time. + + Instead, all tuples except the empty tuple are tracked when created. + During garbage collection it is determined whether any surviving tuples + can be untracked. A tuple can be untracked if all of its contents are + already not tracked. 
Tuples are examined for untracking in all garbage + collection cycles. It may take more than one cycle to untrack a tuple. + + Dictionaries containing only immutable objects also do not need to be + tracked. Dictionaries are untracked when created. If a tracked item is + inserted into a dictionary (either as a key or value), the dictionary + becomes tracked. During a full garbage collection (all generations), + the collector will untrack any dictionaries whose contents are not + tracked. + + The module provides the python function is_tracked(obj), which returns + the CURRENT tracking status of the object. Subsequent garbage + collections may change the tracking status of the object. + + Untracking of certain containers was introduced in issue #4688, and + the algorithm was refined in response to issue #14775. +*/ + +struct gc_generation { + PyGC_Head head; + int threshold; /* collection threshold */ + int count; /* count of allocations or collections of younger + generations */ +}; + +/* Running stats per generation */ +struct gc_generation_stats { + /* total number of collections */ + Py_ssize_t collections; + /* total number of collected objects */ + Py_ssize_t collected; + /* total number of uncollectable objects (put into gc.garbage) */ + Py_ssize_t uncollectable; +}; + +struct _gc_runtime_state { + /* List of objects that still need to be cleaned up, singly linked + * via their gc headers' gc_prev pointers. */ + PyObject *trash_delete_later; + /* Current call-stack depth of tp_dealloc calls. */ + int trash_delete_nesting; + + /* Is automatic collection enabled? */ + int enabled; + int debug; + /* linked lists of container objects */ + struct gc_generation generations[NUM_GENERATIONS]; + PyGC_Head *generation0; + /* a permanent generation which won't be collected */ + struct gc_generation permanent_generation; + struct gc_generation_stats generation_stats[NUM_GENERATIONS]; + /* true if we are currently running the collector */ + int collecting; + /* list of uncollectable objects */ + PyObject *garbage; + /* a list of callbacks to be invoked when collection is performed */ + PyObject *callbacks; + + /* This is the number of objects that survived the last full + collection. It approximates the number of long lived objects + tracked by the GC. + + (by "full collection", we mean a collection of the oldest + generation). */ + Py_ssize_t long_lived_total; + /* This is the number of objects that survived all "non-full" + collections, and are awaiting to undergo a full collection for + the first time. */ + Py_ssize_t long_lived_pending; + +#ifdef Py_GIL_DISABLED + /* gh-117783: Deferred reference counting is not fully implemented yet, so + as a temporary measure we treat objects using deferred reference + counting as immortal. The value may be zero, one, or a negative number: + 0: immortalize deferred RC objects once the first thread is created + 1: immortalize all deferred RC objects immediately + <0: suppressed; don't immortalize objects */ + int immortalize; +#endif +}; + +#ifdef Py_GIL_DISABLED +struct _gc_thread_state { + /* Thread-local allocation count. */ + Py_ssize_t alloc_count; +}; +#endif + + +extern void _PyGC_InitState(struct _gc_runtime_state *); + +extern Py_ssize_t _PyGC_Collect(PyThreadState *tstate, int generation, + _PyGC_Reason reason); +extern void _PyGC_CollectNoFail(PyThreadState *tstate); + +/* Freeze objects tracked by the GC and ignore them in future collections. 
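+
+   These appear to back the Python-level gc.freeze()/gc.unfreeze()/
+   gc.get_freeze_count() API. Illustrative usage from C:
+
+       PyInterpreterState *interp = PyInterpreterState_Get();
+       _PyGC_Freeze(interp);   // later collections skip the frozen objects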
*/ +extern void _PyGC_Freeze(PyInterpreterState *interp); +/* Unfreezes objects placing them in the oldest generation */ +extern void _PyGC_Unfreeze(PyInterpreterState *interp); +/* Number of frozen objects */ +extern Py_ssize_t _PyGC_GetFreezeCount(PyInterpreterState *interp); + +extern PyObject *_PyGC_GetObjects(PyInterpreterState *interp, int generation); +extern PyObject *_PyGC_GetReferrers(PyInterpreterState *interp, PyObject *objs); + +// Functions to clear types free lists +extern void _PyGC_ClearAllFreeLists(PyInterpreterState *interp); +extern void _Py_ScheduleGC(PyThreadState *tstate); +extern void _Py_RunGC(PyThreadState *tstate); + +#ifdef Py_GIL_DISABLED +// gh-117783: Immortalize objects that use deferred reference counting +extern void _PyGC_ImmortalizeDeferredObjects(PyInterpreterState *interp); +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_GC_H */ diff --git a/Include/internal/pycore_genobject.h b/Include/internal/pycore_genobject.h new file mode 100644 index 0000000000000000000000000000000000000000..9463c822ad86698e64aced58adfc1088cb14ab29 --- /dev/null +++ b/Include/internal/pycore_genobject.h @@ -0,0 +1,32 @@ +#ifndef Py_INTERNAL_GENOBJECT_H +#define Py_INTERNAL_GENOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_freelist.h" + +PyAPI_FUNC(PyObject *)_PyGen_yf(PyGenObject *); +extern void _PyGen_Finalize(PyObject *self); + +// Export for '_asyncio' shared extension +PyAPI_FUNC(int) _PyGen_SetStopIterationValue(PyObject *); + +// Export for '_asyncio' shared extension +PyAPI_FUNC(int) _PyGen_FetchStopIterationValue(PyObject **); + +PyAPI_FUNC(PyObject *)_PyCoro_GetAwaitableIter(PyObject *o); +extern PyObject *_PyAsyncGenValueWrapperNew(PyThreadState *state, PyObject *); + +extern PyTypeObject _PyCoroWrapper_Type; +extern PyTypeObject _PyAsyncGenWrappedValue_Type; +extern PyTypeObject _PyAsyncGenAThrow_Type; + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_GENOBJECT_H */ diff --git a/Include/internal/pycore_getopt.h b/Include/internal/pycore_getopt.h new file mode 100644 index 0000000000000000000000000000000000000000..7f0dd13ae577f78491f19b7e8fa3ac8c5042bb11 --- /dev/null +++ b/Include/internal/pycore_getopt.h @@ -0,0 +1,22 @@ +#ifndef Py_INTERNAL_PYGETOPT_H +#define Py_INTERNAL_PYGETOPT_H + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +extern int _PyOS_opterr; +extern Py_ssize_t _PyOS_optind; +extern const wchar_t *_PyOS_optarg; + +extern void _PyOS_ResetGetOpt(void); + +typedef struct { + const wchar_t *name; + int has_arg; + int val; +} _PyOS_LongOption; + +extern int _PyOS_GetOpt(Py_ssize_t argc, wchar_t * const *argv, int *longindex); + +#endif /* !Py_INTERNAL_PYGETOPT_H */ diff --git a/Include/internal/pycore_gil.h b/Include/internal/pycore_gil.h new file mode 100644 index 0000000000000000000000000000000000000000..a2de5077371ebae37b9b17e870c3195f0e8b7714 --- /dev/null +++ b/Include/internal/pycore_gil.h @@ -0,0 +1,66 @@ +#ifndef Py_INTERNAL_GIL_H +#define Py_INTERNAL_GIL_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_condvar.h" // PyCOND_T + +#ifndef Py_HAVE_CONDVAR +# error You need either a POSIX-compatible or a Windows system! +#endif + +/* Enable if you want to force the switching of threads at least + every `interval`. 
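+
+   Note that the `interval` field below is stored in microseconds while
+   the Python-level API works in seconds, so a 0.005 s switch interval
+   would (illustratively, for a hypothetical `gil` pointer) correspond to:
+
+       gil->interval = 5000;   // 0.005 s expressed in microseconds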
*/ +#undef FORCE_SWITCHING +#define FORCE_SWITCHING + +struct _gil_runtime_state { +#ifdef Py_GIL_DISABLED + /* If this GIL is disabled, enabled == 0. + + If this GIL is enabled transiently (most likely to initialize a module + of unknown safety), enabled indicates the number of active transient + requests. + + If this GIL is enabled permanently, enabled == INT_MAX. + + It must not be modified directly; use _PyEval_EnableGILTransiently(), + _PyEval_EnableGILPermanently(), and _PyEval_DisableGIL() + + It is always read and written atomically, but a thread can assume its + value will be stable as long as that thread is attached or knows that no + other threads are attached (e.g., during a stop-the-world.). */ + int enabled; +#endif + /* microseconds (the Python API uses seconds, though) */ + unsigned long interval; + /* Last PyThreadState holding / having held the GIL. This helps us + know whether anyone else was scheduled after we dropped the GIL. */ + PyThreadState* last_holder; + /* Whether the GIL is already taken (-1 if uninitialized). This is + atomic because it can be read without any lock taken in ceval.c. */ + int locked; + /* Number of GIL switches since the beginning. */ + unsigned long switch_number; + /* This condition variable allows one or several threads to wait + until the GIL is released. In addition, the mutex also protects + the above variables. */ + PyCOND_T cond; + PyMUTEX_T mutex; +#ifdef FORCE_SWITCHING + /* This condition variable helps the GIL-releasing thread wait for + a GIL-awaiting thread to be scheduled and take the GIL. */ + PyCOND_T switch_cond; + PyMUTEX_T switch_mutex; +#endif +}; + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_GIL_H */ diff --git a/Include/internal/pycore_global_objects.h b/Include/internal/pycore_global_objects.h new file mode 100644 index 0000000000000000000000000000000000000000..9d376e7db021efc929ac3f51e7dee0b3176f7b76 --- /dev/null +++ b/Include/internal/pycore_global_objects.h @@ -0,0 +1,105 @@ +#ifndef Py_INTERNAL_GLOBAL_OBJECTS_H +#define Py_INTERNAL_GLOBAL_OBJECTS_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_context.h" // _PyContextTokenMissing +#include "pycore_gc.h" // _PyGC_Head_UNUSED +#include "pycore_global_strings.h" // struct _Py_global_strings +#include "pycore_hamt.h" // PyHamtNode_Bitmap +#include "pycore_hashtable.h" // _Py_hashtable_t +#include "pycore_typeobject.h" // pytype_slotdef + + +// These would be in pycore_long.h if it weren't for an include cycle. +#define _PY_NSMALLPOSINTS 257 +#define _PY_NSMALLNEGINTS 5 + + +// Only immutable objects should be considered runtime-global. +// All others must be per-interpreter. + +#define _Py_GLOBAL_OBJECT(NAME) \ + _PyRuntime.static_objects.NAME +#define _Py_SINGLETON(NAME) \ + _Py_GLOBAL_OBJECT(singletons.NAME) + +struct _Py_cached_objects { + // XXX We could statically allocate the hashtable. + _Py_hashtable_t *interned_strings; +}; + +struct _Py_static_objects { + struct { + /* Small integers are preallocated in this array so that they + * can be shared. + * The integers that are preallocated are those in the range + * -_PY_NSMALLNEGINTS (inclusive) to _PY_NSMALLPOSINTS (exclusive). 
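+     *
+     * For a small integer n in that range, the cached object would be
+     * found at (illustrative):
+     *
+     *     PyLongObject *v = &_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + n];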
+ */ + PyLongObject small_ints[_PY_NSMALLNEGINTS + _PY_NSMALLPOSINTS]; + + PyBytesObject bytes_empty; + struct { + PyBytesObject ob; + char eos; + } bytes_characters[256]; + + struct _Py_global_strings strings; + + _PyGC_Head_UNUSED _tuple_empty_gc_not_used; + PyTupleObject tuple_empty; + + _PyGC_Head_UNUSED _hamt_bitmap_node_empty_gc_not_used; + PyHamtNode_Bitmap hamt_bitmap_node_empty; + _PyContextTokenMissing context_token_missing; + } singletons; +}; + +#define _Py_INTERP_CACHED_OBJECT(interp, NAME) \ + (interp)->cached_objects.NAME + +struct _Py_interp_cached_objects { + PyObject *interned_strings; + + /* AST */ + PyObject *_unused_str_replace_inf; // kept in 3.13 for ABI compatibility + + /* object.__reduce__ */ + PyObject *objreduce; + PyObject *type_slots_pname; + pytype_slotdef *type_slots_ptrs[MAX_EQUIV]; + + /* TypeVar and related types */ + PyTypeObject *generic_type; + PyTypeObject *typevar_type; + PyTypeObject *typevartuple_type; + PyTypeObject *paramspec_type; + PyTypeObject *paramspecargs_type; + PyTypeObject *paramspeckwargs_type; +}; + +#define _Py_INTERP_STATIC_OBJECT(interp, NAME) \ + (interp)->static_objects.NAME +#define _Py_INTERP_SINGLETON(interp, NAME) \ + _Py_INTERP_STATIC_OBJECT(interp, singletons.NAME) + +struct _Py_interp_static_objects { + struct { + int _not_used; + // hamt_empty is here instead of global because of its weakreflist. + _PyGC_Head_UNUSED _hamt_empty_gc_not_used; + PyHamtObject hamt_empty; + PyBaseExceptionObject last_resort_memory_error; + } singletons; +}; + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_GLOBAL_OBJECTS_H */ diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h new file mode 100644 index 0000000000000000000000000000000000000000..cd56ffde9e555e607ad3c98c81c7e4b6441286a9 --- /dev/null +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -0,0 +1,1554 @@ +#ifndef Py_INTERNAL_GLOBAL_OBJECTS_FINI_GENERATED_INIT_H +#define Py_INTERNAL_GLOBAL_OBJECTS_FINI_GENERATED_INIT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#ifdef Py_DEBUG +static inline void +_PyStaticObject_CheckRefcnt(PyObject *obj) { + if (Py_REFCNT(obj) < _Py_IMMORTAL_REFCNT) { + fprintf(stderr, "Immortal Object has less refcnt than expected.\n"); + _PyObject_Dump(obj); + } +} +#endif + +/* The following is auto-generated by Tools/build/generate_global_objects.py. 
*/
+#ifdef Py_DEBUG
+static inline void
+_PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
+    /* generated runtime-global */
+    // (see pycore_runtime_init_generated.h)
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + -5]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + -4]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + -3]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + -2]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + -1]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 0]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 1]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 2]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 3]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 4]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 5]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 6]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 7]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 8]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 9]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 10]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 11]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 12]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 13]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 14]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 15]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 16]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 17]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 18]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 19]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 20]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 21]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 22]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 23]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 24]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 25]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 26]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 27]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 28]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 29]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 30]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 31]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 32]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 33]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 34]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 35]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 36]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 37]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 38]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 39]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 40]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 41]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 42]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 43]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 44]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 45]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 46]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 47]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 48]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 49]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 50]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 51]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 52]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 53]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 54]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 55]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 56]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 57]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 58]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 59]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 60]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 61]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 62]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 63]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 64]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 65]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 66]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 67]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 68]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 69]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 70]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 71]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 72]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 73]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 74]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 75]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 76]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 77]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 78]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 79]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 80]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 81]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 82]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 83]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 84]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 85]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 86]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 87]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 88]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 89]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 90]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 91]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 92]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 93]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 94]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 95]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 96]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 97]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 98]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 99]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 100]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 101]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 102]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 103]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 104]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 105]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 106]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 107]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 108]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 109]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 110]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 111]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 112]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 113]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 114]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 115]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 116]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 117]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 118]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 119]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 120]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 121]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 122]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 123]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 124]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 125]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 126]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 127]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 129]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 130]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 131]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 132]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 133]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 134]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 135]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 136]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 137]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 138]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 139]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 140]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 141]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 142]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 143]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 144]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 145]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 146]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 147]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 148]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 149]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 150]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 151]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 152]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 153]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 154]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 155]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 156]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 157]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 158]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 159]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 160]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 161]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 162]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 163]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 164]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 165]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 166]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 167]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 168]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 169]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 170]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 171]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 172]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 173]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 174]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 175]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 176]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 177]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 178]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 179]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 180]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 181]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 182]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 183]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 184]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 185]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 186]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 187]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 188]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 189]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 190]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 191]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 192]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 193]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 194]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 195]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 196]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 197]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 198]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 199]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 200]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 201]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 202]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 203]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 204]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 205]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 206]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 207]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 208]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 209]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 210]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 211]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 212]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 213]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 214]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 215]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 216]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 217]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 218]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 219]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 220]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 221]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 222]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 223]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 224]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 225]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 226]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 227]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 228]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 229]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 230]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 231]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 232]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 233]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 234]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 235]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 236]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 237]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 238]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 239]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 240]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 241]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 242]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 243]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 244]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 245]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 246]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 247]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 248]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 249]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 250]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 251]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 252]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 253]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 254]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 255]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 256]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[0]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[1]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[2]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[3]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[4]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[5]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[6]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[7]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[8]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[9]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[10]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[11]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[12]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[13]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[14]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[15]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[16]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[17]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[18]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[19]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[20]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[21]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[22]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[23]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[24]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[25]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[26]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[27]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[28]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[29]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[30]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[31]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[32]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[33]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[34]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[35]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[36]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[37]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[38]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[39]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[40]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[41]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[42]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[43]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[44]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[45]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[46]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[47]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[48]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[49]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[50]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[51]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[52]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[53]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[54]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[55]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[56]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[57]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[58]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[59]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[60]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[61]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[62]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[63]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[64]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[65]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[66]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[67]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[68]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[69]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[70]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[71]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[72]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[73]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[74]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[75]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[76]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[77]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[78]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[79]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[80]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[81]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[82]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[83]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[84]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[85]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[86]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[87]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[88]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[89]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[90]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[91]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[92]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[93]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[94]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[95]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[96]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[97]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[98]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[99]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[100]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[101]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[102]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[103]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[104]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[105]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[106]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[107]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[108]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[109]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[110]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[111]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[112]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[113]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[114]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[115]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[116]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[117]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[118]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[119]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[120]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[121]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[122]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[123]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[124]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[125]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[126]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[127]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[129]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[130]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[131]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[132]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[133]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[134]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[135]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[136]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[137]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[138]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[139]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[140]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[141]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[142]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[143]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[144]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[145]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[146]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[147]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[148]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[149]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[150]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[151]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[152]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[153]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[154]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[155]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[156]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[157]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[158]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[159]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[160]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[161]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[162]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[163]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[164]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[165]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[166]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[167]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[168]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[169]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[170]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[171]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[172]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[173]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[174]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[175]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[176]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[177]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[178]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[179]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[180]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[181]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[182]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[183]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[184]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[185]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[186]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[187]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[188]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[189]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[190]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[191]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[192]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[193]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[194]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[195]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[196]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[197]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[198]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[199]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[200]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[201]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[202]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[203]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[204]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[205]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[206]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[207]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[208]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[209]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[210]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[211]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[212]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[213]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[214]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[215]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[216]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[217]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[218]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[219]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[220]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[221]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[222]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[223]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[224]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[225]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[226]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[227]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[228]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[229]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[230]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[231]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[232]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[233]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[234]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[235]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[236]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[237]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[238]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[239]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[240]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[241]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[242]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[243]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[244]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[245]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[246]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[247]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[248]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[249]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[250]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[251]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[252]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[253]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[254]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[255]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_dictcomp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_genexpr));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_lambda));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_listcomp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_module));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_null));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_setcomp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_string));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_unknown));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(dbl_close_br));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(dbl_open_br));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(dbl_percent));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(defaults));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(dot_locals));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(empty));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(generic_base));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(json_decoder));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(kwdefaults));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(list_err));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(str_replace_inf));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(type_params));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(utf_8));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(CANCELLED));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(FINISHED));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(False));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(JSONDecodeError));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(PENDING));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(Py_Repr));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(TextIOWrapper));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(True));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(WarningMessage));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_WindowsConsoleIO));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__IOBase_closed));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__abc_tpflags__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__abs__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__abstractmethods__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__add__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__aenter__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__aexit__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__aiter__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__all__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__and__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__anext__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__annotations__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__args__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__await__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__bases__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__bool__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__buffer__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__build_class__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__builtins__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__bytes__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__call__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__cantrace__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__class__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__class_getitem__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__classcell__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__classdict__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__classdictcell__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__complex__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__contains__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__copy__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ctypes_from_outparam__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__del__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__delattr__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__delete__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__delitem__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__dict__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__dictoffset__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__dir__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__divmod__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__doc__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__enter__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__eq__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__exit__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__file__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__firstlineno__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__float__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__floordiv__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__format__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__fspath__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ge__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__get__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getattr__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getattribute__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getinitargs__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getitem__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getnewargs__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getnewargs_ex__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getstate__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__gt__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__hash__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__iadd__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__iand__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ifloordiv__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ilshift__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__imatmul__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__imod__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__import__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__imul__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__index__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__init__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__init_subclass__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__instancecheck__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__int__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__invert__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ior__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ipow__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__irshift__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__isabstractmethod__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__isub__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__iter__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__itruediv__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ixor__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__le__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__len__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__length_hint__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__lltrace__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__loader__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__lshift__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__lt__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__main__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__match_args__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__matmul__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__missing__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__mod__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__module__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__mro_entries__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__mul__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__name__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ne__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__neg__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__new__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__newobj__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__newobj_ex__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__next__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__notes__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__or__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__orig_class__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__origin__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__package__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__parameters__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__path__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__pos__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__pow__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__prepare__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__qualname__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__radd__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rand__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rdivmod__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__reduce__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__reduce_ex__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__release_buffer__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__repr__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__reversed__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rfloordiv__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rlshift__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rmatmul__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rmod__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rmul__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ror__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__round__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rpow__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rrshift__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rshift__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rsub__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rtruediv__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rxor__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__set__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__set_name__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__setattr__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__setitem__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__setstate__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__sizeof__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__slotnames__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__slots__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__spec__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__static_attributes__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__str__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__sub__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__subclasscheck__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__subclasshook__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__truediv__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__trunc__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__type_params__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__typing_is_unpacked_typevartuple__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__typing_prepare_subst__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__typing_subst__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__typing_unpacked_tuple_args__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__warningregistry__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__weaklistoffset__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__weakref__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__xor__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_abc_impl));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_abstract_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_active));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_align_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_annotation));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_anonymous_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_argtypes_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_as_parameter_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_asyncio_future_blocking));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_blksize));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_bootstrap));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_check_retval_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_dealloc_warn));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_feature_version));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_field_types));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_fields_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_finalizing));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_find_and_load));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_fix_up_module));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_flags_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_get_sourcefile));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_handle_fromlist));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_initializing));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_io));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_is_text_encoding));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_length_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_limbo));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_lock_unlock_module));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_loop));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_needs_com_addref_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_only_immortal));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_pack_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_restype_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_showwarnmsg));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_shutdown));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_slotnames));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_strptime));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_strptime_datetime));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_swappedbytes_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_type_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_uninitialized_submodules));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_warn_unawaited_coroutine));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_xoptions));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(abs_tol));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(access));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(aclose));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(add));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(add_done_callback));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(after_in_child));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(after_in_parent));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(aggregate_class));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(alias));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(allow_code));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(append));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(arg));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(argdefs));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(args));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(arguments));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(argv));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(as_integer_ratio));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(asend));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ast));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(athrow));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(attribute));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(authorizer_callback));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(autocommit));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(backtick));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(base));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(before));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(big));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(binary_form));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(block));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(bound));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(buffer));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(buffer_callback));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(buffer_size));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(buffering));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(buffers));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(bufsize));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(builtins));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(byteorder));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(bytes));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(bytes_per_sep));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_call));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_exception));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_return));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cached_datetime_module));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cached_statements));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cadata));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cafile));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(call));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(call_exception_handler));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(call_soon));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(callback));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cancel));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(capath));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(category));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cb_type));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(certfile));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(check_same_thread));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(clear));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(close));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(closed));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(closefd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(closure));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_argcount));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_cellvars));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_code));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_consts));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_exceptiontable));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_filename));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_firstlineno));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_flags));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_freevars));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_kwonlyargcount));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_linetable));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_name));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_names));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_nlocals));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_posonlyargcount));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_qualname));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_stacksize));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_varnames));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(code));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(col_offset));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(command));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(comment_factory));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compile_mode));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(consts));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(context));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(contravariant));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cookie));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(copy));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(copyreg));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(coro));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(count));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(covariant));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cwd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(data));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(database));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(day));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(decode));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(decoder));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(default));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(defaultaction));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(delete));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(depth));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(desired_access));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(detect_types));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(deterministic));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(device));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dict));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dictcomp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(difference_update));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(digest));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(digest_size));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(digestmod));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dir_fd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(discard));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dispatch_table));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(displayhook));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dklen));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(doc));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dont_inherit));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dst));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dst_dir_fd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(eager_start));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(effective_ids));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(element_factory));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(encode));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(encoding));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(end));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(end_col_offset));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(end_lineno));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(end_offset));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(endpos));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(entrypoint));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(env));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(errors));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(event));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(eventmask));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_type));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_value));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(excepthook));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exception));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(existing_file_name));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extend));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extra_tokens));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(facility));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(factory));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(false));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(family));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fanout));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fd2));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fdel));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fget));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(file));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(file_actions));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(filename));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fileno));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(filepath));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fillvalue));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(filter));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(filters));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(final));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(find_class));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fix_imports));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(flags));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(flush));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fold));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(follow_symlinks));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(format));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(from_param));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fromlist));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fromtimestamp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fromutc));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fset));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(func));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(future));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(generation));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(genexpr));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(get));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(get_debug));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(get_event_loop));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(get_loop));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(get_source));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(getattr));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(getstate));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(gid));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(globals));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(groupindex));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(groups));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(handle));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(handle_seq));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(has_location));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(hash_name));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(header));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(headers));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(hi));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(hook));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(hour));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ident));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(identity_hint));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ignore));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(imag));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(importlib));
+    _PyStaticObject_CheckRefcnt((PyObject
*)&_Py_ID(in_fd)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(incoming)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(indexgroup)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(inf)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(infer_variance)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(inherit_handle)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(inheritable)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(initial)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(initial_bytes)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(initial_owner)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(initial_state)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(initial_value)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(initval)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(inner_size)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(input)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(insert_comments)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(insert_pis)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(instructions)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(intern)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(intersection)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(interval)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(is_running)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(isatty)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(isinstance)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(isoformat)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(isolation_level)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(istext)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(item)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(items)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(iter)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(iterable)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(iterations)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(join)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(jump)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(keepends)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(key)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(keyfile)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(keys)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(kind)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(kw)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(kw1)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(kw2)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(kwdefaults)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(label)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(lambda)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(last)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(last_exc)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(last_node)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(last_traceback)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(last_type)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(last_value)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(latin1)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(leaf_size)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(len)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(length)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(level)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(limit)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(line)); + 
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(line_buffering)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(lineno)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(listcomp)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(little)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(lo)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(locale)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(locals)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(logoption)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(loop)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(manual_reset)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(mapping)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(match)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(max_length)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(maxdigits)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(maxevents)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(maxlen)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(maxmem)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(maxsplit)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(maxvalue)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(memLevel)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(memlimit)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(message)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(metaclass)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(metadata)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(method)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(microsecond)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(milliseconds)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(minute)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(mod)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(mode)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(module)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(module_globals)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(modules)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(month)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(mro)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(msg)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(mutex)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(mycmp)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(n_arg)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(n_fields)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(n_sequence_fields)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(n_unnamed_fields)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(name)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(name_from)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(namespace_separator)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(namespaces)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(narg)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ndigits)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(nested)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(new_file_name)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(new_limit)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(newline)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(newlines)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(next)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(nlocals)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(node_depth)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(node_offset)); + 
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ns)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(nstype)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(nt)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(null)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(number)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(obj)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(object)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(offset)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(offset_dst)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(offset_src)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(on_type_read)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(onceregistry)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(only_keys)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(oparg)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(opcode)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(open)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(opener)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(operation)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(optimize)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(options)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(order)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(origin)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(out_fd)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(outgoing)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(overlapped)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(owner)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pages)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(parent)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(password)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(path)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pattern)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(peek)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(persistent_id)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(persistent_load)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(person)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pi_factory)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pid)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(policy)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pos)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pos1)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pos2)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(posix)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(print_file_and_line)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(priority)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_handler)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_routine)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(proto)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(protocol)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ps1)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ps2)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(query)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(quotetabs)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(raw)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(read)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(read1)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(readable)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(readall)); + 
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(readinto)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(readinto1)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(readline)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(readonly)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(real)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reducer_override)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(registry)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(rel_tol)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(release)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reload)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(repl)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(replace)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reserved)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reset)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(resetids)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(return)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reverse)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reversed)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(salt)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sched_priority)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(scheduler)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(second)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(security_attributes)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(seek)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(seekable)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(selectors)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(self)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(send)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sep)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sequence)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(server_hostname)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(server_side)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(session)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(setcomp)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(setpgroup)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(setsid)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(setsigdef)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(setsigmask)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(setstate)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(shape)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(show_cmd)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(signed)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(size)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sizehint)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(skip_file_prefixes)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sleep)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sock)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sort)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(source)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(source_traceback)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(spam)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src_dir_fd)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stacklevel)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(statement)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(status)); + _PyStaticObject_CheckRefcnt((PyObject 
*)&_Py_ID(stderr)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stdin)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stdout)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(step)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(steps)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(store_name)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(strategy)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(strftime)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(strict)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(strict_mode)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(string)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sub_key)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(symmetric_difference_update)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tabsize)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tag)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(target)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(target_is_directory)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(task)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tb_frame)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tb_lasti)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tb_lineno)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tb_next)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tell)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(template)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(term)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(text)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(threading)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(throw)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timeout)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(times)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timetuple)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(top)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(trace_callback)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(traceback)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(trailers)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(translate)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(true)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(truncate)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(twice)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(txt)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(type)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(type_params)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tz)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tzinfo)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tzname)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(uid)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(unlink)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(unraisablehook)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(uri)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(usedforsecurity)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(value)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(values)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(version)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(volume)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(wait_all)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(warn_on_full_buffer)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(warnings)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(warnoptions)); + 
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(wbits)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(week)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(weekday)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(which)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(who)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(withdata)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(writable)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(write)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(write_through)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(year)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(zdict)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[0]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[1]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[2]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[3]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[4]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[5]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[6]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[7]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[8]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[9]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[10]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[11]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[12]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[13]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[14]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[15]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[16]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[17]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[18]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[19]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[20]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[21]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[22]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[23]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[24]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[25]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[26]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[27]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[28]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[29]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[30]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[31]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[32]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[33]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[34]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[35]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[36]); + 
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[37]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[38]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[39]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[40]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[41]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[42]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[43]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[44]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[45]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[46]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[47]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[48]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[49]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[50]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[51]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[52]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[53]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[54]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[55]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[56]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[57]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[58]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[59]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[60]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[61]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[62]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[63]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[64]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[65]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[66]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[67]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[68]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[69]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[70]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[71]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[72]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[73]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[74]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[75]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[76]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[77]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[78]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[79]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[80]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[81]); + _PyStaticObject_CheckRefcnt((PyObject 
*)&_Py_SINGLETON(strings).ascii[82]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[83]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[84]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[85]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[86]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[87]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[88]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[89]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[90]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[91]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[92]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[93]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[94]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[95]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[96]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[97]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[98]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[99]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[100]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[101]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[102]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[103]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[104]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[105]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[106]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[107]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[108]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[109]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[110]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[111]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[112]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[113]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[114]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[115]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[116]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[117]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[118]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[119]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[120]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[121]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[122]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[123]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[124]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[125]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[126]); + _PyStaticObject_CheckRefcnt((PyObject 
*)&_Py_SINGLETON(strings).ascii[127]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[128 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[129 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[130 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[131 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[132 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[133 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[134 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[135 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[136 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[137 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[138 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[139 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[140 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[141 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[142 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[143 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[144 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[145 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[146 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[147 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[148 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[149 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[150 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[151 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[152 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[153 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[154 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[155 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[156 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[157 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[158 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[159 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[160 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[161 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[162 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[163 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[164 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[165 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[166 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[167 - 128]); + _PyStaticObject_CheckRefcnt((PyObject 
*)&_Py_SINGLETON(strings).latin1[168 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[169 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[170 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[171 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[172 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[173 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[174 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[175 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[176 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[177 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[178 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[179 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[180 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[181 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[182 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[183 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[184 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[185 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[186 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[187 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[188 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[189 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[190 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[191 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[192 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[193 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[194 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[195 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[196 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[197 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[198 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[199 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[200 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[201 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[202 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[203 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[204 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[205 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[206 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[207 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[208 - 128]); + _PyStaticObject_CheckRefcnt((PyObject 
*)&_Py_SINGLETON(strings).latin1[209 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[210 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[211 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[212 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[213 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[214 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[215 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[216 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[217 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[218 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[219 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[220 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[221 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[222 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[223 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[224 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[225 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[226 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[227 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[228 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[229 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[230 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[231 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[232 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[233 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[234 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[235 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[236 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[237 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[238 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[239 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[240 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[241 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[242 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[243 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[244 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[245 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[246 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[247 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[248 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[249 - 128]); + _PyStaticObject_CheckRefcnt((PyObject 
*)&_Py_SINGLETON(strings).latin1[250 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[251 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[252 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[253 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[254 - 128]); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[255 - 128]); + /* non-generated */ + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_empty)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(tuple_empty)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(hamt_bitmap_node_empty)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_INTERP_SINGLETON(interp, hamt_empty)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(context_token_missing)); +} +#endif // Py_DEBUG +/* End auto-generated code */ + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_GLOBAL_OBJECTS_FINI_GENERATED_INIT_H */ diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h new file mode 100644 index 0000000000000000000000000000000000000000..cad2d1a8d22049ffee8b257b6a1db4fef26ac365 --- /dev/null +++ b/Include/internal/pycore_global_strings.h @@ -0,0 +1,814 @@ +#ifndef Py_INTERNAL_GLOBAL_STRINGS_H +#define Py_INTERNAL_GLOBAL_STRINGS_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +// The data structure & init here are inspired by Tools/build/deepfreeze.py. + +// All field names generated by ASCII_STR() have a common prefix, +// to help avoid collisions with keywords, macros, etc. + +#define STRUCT_FOR_ASCII_STR(LITERAL) \ + struct { \ + PyASCIIObject _ascii; \ + uint8_t _data[sizeof(LITERAL)]; \ + } +#define STRUCT_FOR_STR(NAME, LITERAL) \ + STRUCT_FOR_ASCII_STR(LITERAL) _py_ ## NAME; +#define STRUCT_FOR_ID(NAME) \ + STRUCT_FOR_ASCII_STR(#NAME) _py_ ## NAME; + +// XXX Order by frequency of use? + +/* The following is auto-generated by Tools/build/generate_global_objects.py. 
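(Editor's note, illustrative only: given the macros defined above, a declaration such as STRUCT_FOR_ID(add) expands to roughly

    struct {
        PyASCIIObject _ascii;
        uint8_t _data[sizeof("add")];
    } _py_add;

so each interned identifier is a statically allocated str object whose character bytes sit inline immediately after the PyASCIIObject header. That static allocation is what lets the _PyStaticObject_CheckRefcnt() calls in the fini header above take the objects' addresses directly. The _Py_ID(NAME) accessor macro those calls use resolves to the matching _py_ ## NAME member; in CPython's sources its definition appears later in this same header and is not shown here.)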
*/ +struct _Py_global_strings { + struct { + STRUCT_FOR_STR(anon_dictcomp, "<dictcomp>") + STRUCT_FOR_STR(anon_genexpr, "<genexpr>") + STRUCT_FOR_STR(anon_lambda, "<lambda>") + STRUCT_FOR_STR(anon_listcomp, "<listcomp>") + STRUCT_FOR_STR(anon_module, "<module>") + STRUCT_FOR_STR(anon_null, "<NULL>") + STRUCT_FOR_STR(anon_setcomp, "<setcomp>") + STRUCT_FOR_STR(anon_string, "<string>") + STRUCT_FOR_STR(anon_unknown, "<unknown>") + STRUCT_FOR_STR(dbl_close_br, "}}") + STRUCT_FOR_STR(dbl_open_br, "{{") + STRUCT_FOR_STR(dbl_percent, "%%") + STRUCT_FOR_STR(defaults, ".defaults") + STRUCT_FOR_STR(dot_locals, ".<locals>") + STRUCT_FOR_STR(empty, "") + STRUCT_FOR_STR(generic_base, ".generic_base") + STRUCT_FOR_STR(json_decoder, "json.decoder") + STRUCT_FOR_STR(kwdefaults, ".kwdefaults") + STRUCT_FOR_STR(list_err, "list index out of range") + STRUCT_FOR_STR(str_replace_inf, "1e309") + STRUCT_FOR_STR(type_params, ".type_params") + STRUCT_FOR_STR(utf_8, "utf-8") + } literals; + + struct { + STRUCT_FOR_ID(CANCELLED) + STRUCT_FOR_ID(FINISHED) + STRUCT_FOR_ID(False) + STRUCT_FOR_ID(JSONDecodeError) + STRUCT_FOR_ID(PENDING) + STRUCT_FOR_ID(Py_Repr) + STRUCT_FOR_ID(TextIOWrapper) + STRUCT_FOR_ID(True) + STRUCT_FOR_ID(WarningMessage) + STRUCT_FOR_ID(_WindowsConsoleIO) + STRUCT_FOR_ID(__IOBase_closed) + STRUCT_FOR_ID(__abc_tpflags__) + STRUCT_FOR_ID(__abs__) + STRUCT_FOR_ID(__abstractmethods__) + STRUCT_FOR_ID(__add__) + STRUCT_FOR_ID(__aenter__) + STRUCT_FOR_ID(__aexit__) + STRUCT_FOR_ID(__aiter__) + STRUCT_FOR_ID(__all__) + STRUCT_FOR_ID(__and__) + STRUCT_FOR_ID(__anext__) + STRUCT_FOR_ID(__annotations__) + STRUCT_FOR_ID(__args__) + STRUCT_FOR_ID(__await__) + STRUCT_FOR_ID(__bases__) + STRUCT_FOR_ID(__bool__) + STRUCT_FOR_ID(__buffer__) + STRUCT_FOR_ID(__build_class__) + STRUCT_FOR_ID(__builtins__) + STRUCT_FOR_ID(__bytes__) + STRUCT_FOR_ID(__call__) + STRUCT_FOR_ID(__cantrace__) + STRUCT_FOR_ID(__class__) + STRUCT_FOR_ID(__class_getitem__) + STRUCT_FOR_ID(__classcell__) + STRUCT_FOR_ID(__classdict__) + STRUCT_FOR_ID(__classdictcell__) + STRUCT_FOR_ID(__complex__) + STRUCT_FOR_ID(__contains__) + STRUCT_FOR_ID(__copy__) + STRUCT_FOR_ID(__ctypes_from_outparam__) + STRUCT_FOR_ID(__del__) + STRUCT_FOR_ID(__delattr__) + STRUCT_FOR_ID(__delete__) + STRUCT_FOR_ID(__delitem__) + STRUCT_FOR_ID(__dict__) + STRUCT_FOR_ID(__dictoffset__) + STRUCT_FOR_ID(__dir__) + STRUCT_FOR_ID(__divmod__) + STRUCT_FOR_ID(__doc__) + STRUCT_FOR_ID(__enter__) + STRUCT_FOR_ID(__eq__) + STRUCT_FOR_ID(__exit__) + STRUCT_FOR_ID(__file__) + STRUCT_FOR_ID(__firstlineno__) + STRUCT_FOR_ID(__float__) + STRUCT_FOR_ID(__floordiv__) + STRUCT_FOR_ID(__format__) + STRUCT_FOR_ID(__fspath__) + STRUCT_FOR_ID(__ge__) + STRUCT_FOR_ID(__get__) + STRUCT_FOR_ID(__getattr__) + STRUCT_FOR_ID(__getattribute__) + STRUCT_FOR_ID(__getinitargs__) + STRUCT_FOR_ID(__getitem__) + STRUCT_FOR_ID(__getnewargs__) + STRUCT_FOR_ID(__getnewargs_ex__) + STRUCT_FOR_ID(__getstate__) + STRUCT_FOR_ID(__gt__) + STRUCT_FOR_ID(__hash__) + STRUCT_FOR_ID(__iadd__) + STRUCT_FOR_ID(__iand__) + STRUCT_FOR_ID(__ifloordiv__) + STRUCT_FOR_ID(__ilshift__) + STRUCT_FOR_ID(__imatmul__) + STRUCT_FOR_ID(__imod__) + STRUCT_FOR_ID(__import__) + STRUCT_FOR_ID(__imul__) + STRUCT_FOR_ID(__index__) + STRUCT_FOR_ID(__init__) + STRUCT_FOR_ID(__init_subclass__) + STRUCT_FOR_ID(__instancecheck__) + STRUCT_FOR_ID(__int__) + STRUCT_FOR_ID(__invert__) + STRUCT_FOR_ID(__ior__) + STRUCT_FOR_ID(__ipow__) + STRUCT_FOR_ID(__irshift__) + STRUCT_FOR_ID(__isabstractmethod__) + STRUCT_FOR_ID(__isub__) + STRUCT_FOR_ID(__iter__) + STRUCT_FOR_ID(__itruediv__) + STRUCT_FOR_ID(__ixor__) +
STRUCT_FOR_ID(__le__) + STRUCT_FOR_ID(__len__) + STRUCT_FOR_ID(__length_hint__) + STRUCT_FOR_ID(__lltrace__) + STRUCT_FOR_ID(__loader__) + STRUCT_FOR_ID(__lshift__) + STRUCT_FOR_ID(__lt__) + STRUCT_FOR_ID(__main__) + STRUCT_FOR_ID(__match_args__) + STRUCT_FOR_ID(__matmul__) + STRUCT_FOR_ID(__missing__) + STRUCT_FOR_ID(__mod__) + STRUCT_FOR_ID(__module__) + STRUCT_FOR_ID(__mro_entries__) + STRUCT_FOR_ID(__mul__) + STRUCT_FOR_ID(__name__) + STRUCT_FOR_ID(__ne__) + STRUCT_FOR_ID(__neg__) + STRUCT_FOR_ID(__new__) + STRUCT_FOR_ID(__newobj__) + STRUCT_FOR_ID(__newobj_ex__) + STRUCT_FOR_ID(__next__) + STRUCT_FOR_ID(__notes__) + STRUCT_FOR_ID(__or__) + STRUCT_FOR_ID(__orig_class__) + STRUCT_FOR_ID(__origin__) + STRUCT_FOR_ID(__package__) + STRUCT_FOR_ID(__parameters__) + STRUCT_FOR_ID(__path__) + STRUCT_FOR_ID(__pos__) + STRUCT_FOR_ID(__pow__) + STRUCT_FOR_ID(__prepare__) + STRUCT_FOR_ID(__qualname__) + STRUCT_FOR_ID(__radd__) + STRUCT_FOR_ID(__rand__) + STRUCT_FOR_ID(__rdivmod__) + STRUCT_FOR_ID(__reduce__) + STRUCT_FOR_ID(__reduce_ex__) + STRUCT_FOR_ID(__release_buffer__) + STRUCT_FOR_ID(__repr__) + STRUCT_FOR_ID(__reversed__) + STRUCT_FOR_ID(__rfloordiv__) + STRUCT_FOR_ID(__rlshift__) + STRUCT_FOR_ID(__rmatmul__) + STRUCT_FOR_ID(__rmod__) + STRUCT_FOR_ID(__rmul__) + STRUCT_FOR_ID(__ror__) + STRUCT_FOR_ID(__round__) + STRUCT_FOR_ID(__rpow__) + STRUCT_FOR_ID(__rrshift__) + STRUCT_FOR_ID(__rshift__) + STRUCT_FOR_ID(__rsub__) + STRUCT_FOR_ID(__rtruediv__) + STRUCT_FOR_ID(__rxor__) + STRUCT_FOR_ID(__set__) + STRUCT_FOR_ID(__set_name__) + STRUCT_FOR_ID(__setattr__) + STRUCT_FOR_ID(__setitem__) + STRUCT_FOR_ID(__setstate__) + STRUCT_FOR_ID(__sizeof__) + STRUCT_FOR_ID(__slotnames__) + STRUCT_FOR_ID(__slots__) + STRUCT_FOR_ID(__spec__) + STRUCT_FOR_ID(__static_attributes__) + STRUCT_FOR_ID(__str__) + STRUCT_FOR_ID(__sub__) + STRUCT_FOR_ID(__subclasscheck__) + STRUCT_FOR_ID(__subclasshook__) + STRUCT_FOR_ID(__truediv__) + STRUCT_FOR_ID(__trunc__) + STRUCT_FOR_ID(__type_params__) + STRUCT_FOR_ID(__typing_is_unpacked_typevartuple__) + STRUCT_FOR_ID(__typing_prepare_subst__) + STRUCT_FOR_ID(__typing_subst__) + STRUCT_FOR_ID(__typing_unpacked_tuple_args__) + STRUCT_FOR_ID(__warningregistry__) + STRUCT_FOR_ID(__weaklistoffset__) + STRUCT_FOR_ID(__weakref__) + STRUCT_FOR_ID(__xor__) + STRUCT_FOR_ID(_abc_impl) + STRUCT_FOR_ID(_abstract_) + STRUCT_FOR_ID(_active) + STRUCT_FOR_ID(_align_) + STRUCT_FOR_ID(_annotation) + STRUCT_FOR_ID(_anonymous_) + STRUCT_FOR_ID(_argtypes_) + STRUCT_FOR_ID(_as_parameter_) + STRUCT_FOR_ID(_asyncio_future_blocking) + STRUCT_FOR_ID(_blksize) + STRUCT_FOR_ID(_bootstrap) + STRUCT_FOR_ID(_check_retval_) + STRUCT_FOR_ID(_dealloc_warn) + STRUCT_FOR_ID(_feature_version) + STRUCT_FOR_ID(_field_types) + STRUCT_FOR_ID(_fields_) + STRUCT_FOR_ID(_finalizing) + STRUCT_FOR_ID(_find_and_load) + STRUCT_FOR_ID(_fix_up_module) + STRUCT_FOR_ID(_flags_) + STRUCT_FOR_ID(_get_sourcefile) + STRUCT_FOR_ID(_handle_fromlist) + STRUCT_FOR_ID(_initializing) + STRUCT_FOR_ID(_io) + STRUCT_FOR_ID(_is_text_encoding) + STRUCT_FOR_ID(_length_) + STRUCT_FOR_ID(_limbo) + STRUCT_FOR_ID(_lock_unlock_module) + STRUCT_FOR_ID(_loop) + STRUCT_FOR_ID(_needs_com_addref_) + STRUCT_FOR_ID(_only_immortal) + STRUCT_FOR_ID(_pack_) + STRUCT_FOR_ID(_restype_) + STRUCT_FOR_ID(_showwarnmsg) + STRUCT_FOR_ID(_shutdown) + STRUCT_FOR_ID(_slotnames) + STRUCT_FOR_ID(_strptime) + STRUCT_FOR_ID(_strptime_datetime) + STRUCT_FOR_ID(_swappedbytes_) + STRUCT_FOR_ID(_type_) + STRUCT_FOR_ID(_uninitialized_submodules) + 
STRUCT_FOR_ID(_warn_unawaited_coroutine) + STRUCT_FOR_ID(_xoptions) + STRUCT_FOR_ID(abs_tol) + STRUCT_FOR_ID(access) + STRUCT_FOR_ID(aclose) + STRUCT_FOR_ID(add) + STRUCT_FOR_ID(add_done_callback) + STRUCT_FOR_ID(after_in_child) + STRUCT_FOR_ID(after_in_parent) + STRUCT_FOR_ID(aggregate_class) + STRUCT_FOR_ID(alias) + STRUCT_FOR_ID(allow_code) + STRUCT_FOR_ID(append) + STRUCT_FOR_ID(arg) + STRUCT_FOR_ID(argdefs) + STRUCT_FOR_ID(args) + STRUCT_FOR_ID(arguments) + STRUCT_FOR_ID(argv) + STRUCT_FOR_ID(as_integer_ratio) + STRUCT_FOR_ID(asend) + STRUCT_FOR_ID(ast) + STRUCT_FOR_ID(athrow) + STRUCT_FOR_ID(attribute) + STRUCT_FOR_ID(authorizer_callback) + STRUCT_FOR_ID(autocommit) + STRUCT_FOR_ID(backtick) + STRUCT_FOR_ID(base) + STRUCT_FOR_ID(before) + STRUCT_FOR_ID(big) + STRUCT_FOR_ID(binary_form) + STRUCT_FOR_ID(block) + STRUCT_FOR_ID(bound) + STRUCT_FOR_ID(buffer) + STRUCT_FOR_ID(buffer_callback) + STRUCT_FOR_ID(buffer_size) + STRUCT_FOR_ID(buffering) + STRUCT_FOR_ID(buffers) + STRUCT_FOR_ID(bufsize) + STRUCT_FOR_ID(builtins) + STRUCT_FOR_ID(byteorder) + STRUCT_FOR_ID(bytes) + STRUCT_FOR_ID(bytes_per_sep) + STRUCT_FOR_ID(c_call) + STRUCT_FOR_ID(c_exception) + STRUCT_FOR_ID(c_return) + STRUCT_FOR_ID(cached_datetime_module) + STRUCT_FOR_ID(cached_statements) + STRUCT_FOR_ID(cadata) + STRUCT_FOR_ID(cafile) + STRUCT_FOR_ID(call) + STRUCT_FOR_ID(call_exception_handler) + STRUCT_FOR_ID(call_soon) + STRUCT_FOR_ID(callback) + STRUCT_FOR_ID(cancel) + STRUCT_FOR_ID(capath) + STRUCT_FOR_ID(category) + STRUCT_FOR_ID(cb_type) + STRUCT_FOR_ID(certfile) + STRUCT_FOR_ID(check_same_thread) + STRUCT_FOR_ID(clear) + STRUCT_FOR_ID(close) + STRUCT_FOR_ID(closed) + STRUCT_FOR_ID(closefd) + STRUCT_FOR_ID(closure) + STRUCT_FOR_ID(co_argcount) + STRUCT_FOR_ID(co_cellvars) + STRUCT_FOR_ID(co_code) + STRUCT_FOR_ID(co_consts) + STRUCT_FOR_ID(co_exceptiontable) + STRUCT_FOR_ID(co_filename) + STRUCT_FOR_ID(co_firstlineno) + STRUCT_FOR_ID(co_flags) + STRUCT_FOR_ID(co_freevars) + STRUCT_FOR_ID(co_kwonlyargcount) + STRUCT_FOR_ID(co_linetable) + STRUCT_FOR_ID(co_name) + STRUCT_FOR_ID(co_names) + STRUCT_FOR_ID(co_nlocals) + STRUCT_FOR_ID(co_posonlyargcount) + STRUCT_FOR_ID(co_qualname) + STRUCT_FOR_ID(co_stacksize) + STRUCT_FOR_ID(co_varnames) + STRUCT_FOR_ID(code) + STRUCT_FOR_ID(col_offset) + STRUCT_FOR_ID(command) + STRUCT_FOR_ID(comment_factory) + STRUCT_FOR_ID(compile_mode) + STRUCT_FOR_ID(consts) + STRUCT_FOR_ID(context) + STRUCT_FOR_ID(contravariant) + STRUCT_FOR_ID(cookie) + STRUCT_FOR_ID(copy) + STRUCT_FOR_ID(copyreg) + STRUCT_FOR_ID(coro) + STRUCT_FOR_ID(count) + STRUCT_FOR_ID(covariant) + STRUCT_FOR_ID(cwd) + STRUCT_FOR_ID(data) + STRUCT_FOR_ID(database) + STRUCT_FOR_ID(day) + STRUCT_FOR_ID(decode) + STRUCT_FOR_ID(decoder) + STRUCT_FOR_ID(default) + STRUCT_FOR_ID(defaultaction) + STRUCT_FOR_ID(delete) + STRUCT_FOR_ID(depth) + STRUCT_FOR_ID(desired_access) + STRUCT_FOR_ID(detect_types) + STRUCT_FOR_ID(deterministic) + STRUCT_FOR_ID(device) + STRUCT_FOR_ID(dict) + STRUCT_FOR_ID(dictcomp) + STRUCT_FOR_ID(difference_update) + STRUCT_FOR_ID(digest) + STRUCT_FOR_ID(digest_size) + STRUCT_FOR_ID(digestmod) + STRUCT_FOR_ID(dir_fd) + STRUCT_FOR_ID(discard) + STRUCT_FOR_ID(dispatch_table) + STRUCT_FOR_ID(displayhook) + STRUCT_FOR_ID(dklen) + STRUCT_FOR_ID(doc) + STRUCT_FOR_ID(dont_inherit) + STRUCT_FOR_ID(dst) + STRUCT_FOR_ID(dst_dir_fd) + STRUCT_FOR_ID(eager_start) + STRUCT_FOR_ID(effective_ids) + STRUCT_FOR_ID(element_factory) + STRUCT_FOR_ID(encode) + STRUCT_FOR_ID(encoding) + STRUCT_FOR_ID(end) + 
STRUCT_FOR_ID(end_col_offset) + STRUCT_FOR_ID(end_lineno) + STRUCT_FOR_ID(end_offset) + STRUCT_FOR_ID(endpos) + STRUCT_FOR_ID(entrypoint) + STRUCT_FOR_ID(env) + STRUCT_FOR_ID(errors) + STRUCT_FOR_ID(event) + STRUCT_FOR_ID(eventmask) + STRUCT_FOR_ID(exc_type) + STRUCT_FOR_ID(exc_value) + STRUCT_FOR_ID(excepthook) + STRUCT_FOR_ID(exception) + STRUCT_FOR_ID(existing_file_name) + STRUCT_FOR_ID(exp) + STRUCT_FOR_ID(extend) + STRUCT_FOR_ID(extra_tokens) + STRUCT_FOR_ID(facility) + STRUCT_FOR_ID(factory) + STRUCT_FOR_ID(false) + STRUCT_FOR_ID(family) + STRUCT_FOR_ID(fanout) + STRUCT_FOR_ID(fd) + STRUCT_FOR_ID(fd2) + STRUCT_FOR_ID(fdel) + STRUCT_FOR_ID(fget) + STRUCT_FOR_ID(file) + STRUCT_FOR_ID(file_actions) + STRUCT_FOR_ID(filename) + STRUCT_FOR_ID(fileno) + STRUCT_FOR_ID(filepath) + STRUCT_FOR_ID(fillvalue) + STRUCT_FOR_ID(filter) + STRUCT_FOR_ID(filters) + STRUCT_FOR_ID(final) + STRUCT_FOR_ID(find_class) + STRUCT_FOR_ID(fix_imports) + STRUCT_FOR_ID(flags) + STRUCT_FOR_ID(flush) + STRUCT_FOR_ID(fold) + STRUCT_FOR_ID(follow_symlinks) + STRUCT_FOR_ID(format) + STRUCT_FOR_ID(from_param) + STRUCT_FOR_ID(fromlist) + STRUCT_FOR_ID(fromtimestamp) + STRUCT_FOR_ID(fromutc) + STRUCT_FOR_ID(fset) + STRUCT_FOR_ID(func) + STRUCT_FOR_ID(future) + STRUCT_FOR_ID(generation) + STRUCT_FOR_ID(genexpr) + STRUCT_FOR_ID(get) + STRUCT_FOR_ID(get_debug) + STRUCT_FOR_ID(get_event_loop) + STRUCT_FOR_ID(get_loop) + STRUCT_FOR_ID(get_source) + STRUCT_FOR_ID(getattr) + STRUCT_FOR_ID(getstate) + STRUCT_FOR_ID(gid) + STRUCT_FOR_ID(globals) + STRUCT_FOR_ID(groupindex) + STRUCT_FOR_ID(groups) + STRUCT_FOR_ID(handle) + STRUCT_FOR_ID(handle_seq) + STRUCT_FOR_ID(has_location) + STRUCT_FOR_ID(hash_name) + STRUCT_FOR_ID(header) + STRUCT_FOR_ID(headers) + STRUCT_FOR_ID(hi) + STRUCT_FOR_ID(hook) + STRUCT_FOR_ID(hour) + STRUCT_FOR_ID(ident) + STRUCT_FOR_ID(identity_hint) + STRUCT_FOR_ID(ignore) + STRUCT_FOR_ID(imag) + STRUCT_FOR_ID(importlib) + STRUCT_FOR_ID(in_fd) + STRUCT_FOR_ID(incoming) + STRUCT_FOR_ID(indexgroup) + STRUCT_FOR_ID(inf) + STRUCT_FOR_ID(infer_variance) + STRUCT_FOR_ID(inherit_handle) + STRUCT_FOR_ID(inheritable) + STRUCT_FOR_ID(initial) + STRUCT_FOR_ID(initial_bytes) + STRUCT_FOR_ID(initial_owner) + STRUCT_FOR_ID(initial_state) + STRUCT_FOR_ID(initial_value) + STRUCT_FOR_ID(initval) + STRUCT_FOR_ID(inner_size) + STRUCT_FOR_ID(input) + STRUCT_FOR_ID(insert_comments) + STRUCT_FOR_ID(insert_pis) + STRUCT_FOR_ID(instructions) + STRUCT_FOR_ID(intern) + STRUCT_FOR_ID(intersection) + STRUCT_FOR_ID(interval) + STRUCT_FOR_ID(is_running) + STRUCT_FOR_ID(isatty) + STRUCT_FOR_ID(isinstance) + STRUCT_FOR_ID(isoformat) + STRUCT_FOR_ID(isolation_level) + STRUCT_FOR_ID(istext) + STRUCT_FOR_ID(item) + STRUCT_FOR_ID(items) + STRUCT_FOR_ID(iter) + STRUCT_FOR_ID(iterable) + STRUCT_FOR_ID(iterations) + STRUCT_FOR_ID(join) + STRUCT_FOR_ID(jump) + STRUCT_FOR_ID(keepends) + STRUCT_FOR_ID(key) + STRUCT_FOR_ID(keyfile) + STRUCT_FOR_ID(keys) + STRUCT_FOR_ID(kind) + STRUCT_FOR_ID(kw) + STRUCT_FOR_ID(kw1) + STRUCT_FOR_ID(kw2) + STRUCT_FOR_ID(kwdefaults) + STRUCT_FOR_ID(label) + STRUCT_FOR_ID(lambda) + STRUCT_FOR_ID(last) + STRUCT_FOR_ID(last_exc) + STRUCT_FOR_ID(last_node) + STRUCT_FOR_ID(last_traceback) + STRUCT_FOR_ID(last_type) + STRUCT_FOR_ID(last_value) + STRUCT_FOR_ID(latin1) + STRUCT_FOR_ID(leaf_size) + STRUCT_FOR_ID(len) + STRUCT_FOR_ID(length) + STRUCT_FOR_ID(level) + STRUCT_FOR_ID(limit) + STRUCT_FOR_ID(line) + STRUCT_FOR_ID(line_buffering) + STRUCT_FOR_ID(lineno) + STRUCT_FOR_ID(listcomp) + STRUCT_FOR_ID(little) + STRUCT_FOR_ID(lo) + 
STRUCT_FOR_ID(locale) + STRUCT_FOR_ID(locals) + STRUCT_FOR_ID(logoption) + STRUCT_FOR_ID(loop) + STRUCT_FOR_ID(manual_reset) + STRUCT_FOR_ID(mapping) + STRUCT_FOR_ID(match) + STRUCT_FOR_ID(max_length) + STRUCT_FOR_ID(maxdigits) + STRUCT_FOR_ID(maxevents) + STRUCT_FOR_ID(maxlen) + STRUCT_FOR_ID(maxmem) + STRUCT_FOR_ID(maxsplit) + STRUCT_FOR_ID(maxvalue) + STRUCT_FOR_ID(memLevel) + STRUCT_FOR_ID(memlimit) + STRUCT_FOR_ID(message) + STRUCT_FOR_ID(metaclass) + STRUCT_FOR_ID(metadata) + STRUCT_FOR_ID(method) + STRUCT_FOR_ID(microsecond) + STRUCT_FOR_ID(milliseconds) + STRUCT_FOR_ID(minute) + STRUCT_FOR_ID(mod) + STRUCT_FOR_ID(mode) + STRUCT_FOR_ID(module) + STRUCT_FOR_ID(module_globals) + STRUCT_FOR_ID(modules) + STRUCT_FOR_ID(month) + STRUCT_FOR_ID(mro) + STRUCT_FOR_ID(msg) + STRUCT_FOR_ID(mutex) + STRUCT_FOR_ID(mycmp) + STRUCT_FOR_ID(n_arg) + STRUCT_FOR_ID(n_fields) + STRUCT_FOR_ID(n_sequence_fields) + STRUCT_FOR_ID(n_unnamed_fields) + STRUCT_FOR_ID(name) + STRUCT_FOR_ID(name_from) + STRUCT_FOR_ID(namespace_separator) + STRUCT_FOR_ID(namespaces) + STRUCT_FOR_ID(narg) + STRUCT_FOR_ID(ndigits) + STRUCT_FOR_ID(nested) + STRUCT_FOR_ID(new_file_name) + STRUCT_FOR_ID(new_limit) + STRUCT_FOR_ID(newline) + STRUCT_FOR_ID(newlines) + STRUCT_FOR_ID(next) + STRUCT_FOR_ID(nlocals) + STRUCT_FOR_ID(node_depth) + STRUCT_FOR_ID(node_offset) + STRUCT_FOR_ID(ns) + STRUCT_FOR_ID(nstype) + STRUCT_FOR_ID(nt) + STRUCT_FOR_ID(null) + STRUCT_FOR_ID(number) + STRUCT_FOR_ID(obj) + STRUCT_FOR_ID(object) + STRUCT_FOR_ID(offset) + STRUCT_FOR_ID(offset_dst) + STRUCT_FOR_ID(offset_src) + STRUCT_FOR_ID(on_type_read) + STRUCT_FOR_ID(onceregistry) + STRUCT_FOR_ID(only_keys) + STRUCT_FOR_ID(oparg) + STRUCT_FOR_ID(opcode) + STRUCT_FOR_ID(open) + STRUCT_FOR_ID(opener) + STRUCT_FOR_ID(operation) + STRUCT_FOR_ID(optimize) + STRUCT_FOR_ID(options) + STRUCT_FOR_ID(order) + STRUCT_FOR_ID(origin) + STRUCT_FOR_ID(out_fd) + STRUCT_FOR_ID(outgoing) + STRUCT_FOR_ID(overlapped) + STRUCT_FOR_ID(owner) + STRUCT_FOR_ID(pages) + STRUCT_FOR_ID(parent) + STRUCT_FOR_ID(password) + STRUCT_FOR_ID(path) + STRUCT_FOR_ID(pattern) + STRUCT_FOR_ID(peek) + STRUCT_FOR_ID(persistent_id) + STRUCT_FOR_ID(persistent_load) + STRUCT_FOR_ID(person) + STRUCT_FOR_ID(pi_factory) + STRUCT_FOR_ID(pid) + STRUCT_FOR_ID(policy) + STRUCT_FOR_ID(pos) + STRUCT_FOR_ID(pos1) + STRUCT_FOR_ID(pos2) + STRUCT_FOR_ID(posix) + STRUCT_FOR_ID(print_file_and_line) + STRUCT_FOR_ID(priority) + STRUCT_FOR_ID(progress) + STRUCT_FOR_ID(progress_handler) + STRUCT_FOR_ID(progress_routine) + STRUCT_FOR_ID(proto) + STRUCT_FOR_ID(protocol) + STRUCT_FOR_ID(ps1) + STRUCT_FOR_ID(ps2) + STRUCT_FOR_ID(query) + STRUCT_FOR_ID(quotetabs) + STRUCT_FOR_ID(raw) + STRUCT_FOR_ID(read) + STRUCT_FOR_ID(read1) + STRUCT_FOR_ID(readable) + STRUCT_FOR_ID(readall) + STRUCT_FOR_ID(readinto) + STRUCT_FOR_ID(readinto1) + STRUCT_FOR_ID(readline) + STRUCT_FOR_ID(readonly) + STRUCT_FOR_ID(real) + STRUCT_FOR_ID(reducer_override) + STRUCT_FOR_ID(registry) + STRUCT_FOR_ID(rel_tol) + STRUCT_FOR_ID(release) + STRUCT_FOR_ID(reload) + STRUCT_FOR_ID(repl) + STRUCT_FOR_ID(replace) + STRUCT_FOR_ID(reserved) + STRUCT_FOR_ID(reset) + STRUCT_FOR_ID(resetids) + STRUCT_FOR_ID(return) + STRUCT_FOR_ID(reverse) + STRUCT_FOR_ID(reversed) + STRUCT_FOR_ID(salt) + STRUCT_FOR_ID(sched_priority) + STRUCT_FOR_ID(scheduler) + STRUCT_FOR_ID(second) + STRUCT_FOR_ID(security_attributes) + STRUCT_FOR_ID(seek) + STRUCT_FOR_ID(seekable) + STRUCT_FOR_ID(selectors) + STRUCT_FOR_ID(self) + STRUCT_FOR_ID(send) + STRUCT_FOR_ID(sep) + 
STRUCT_FOR_ID(sequence) + STRUCT_FOR_ID(server_hostname) + STRUCT_FOR_ID(server_side) + STRUCT_FOR_ID(session) + STRUCT_FOR_ID(setcomp) + STRUCT_FOR_ID(setpgroup) + STRUCT_FOR_ID(setsid) + STRUCT_FOR_ID(setsigdef) + STRUCT_FOR_ID(setsigmask) + STRUCT_FOR_ID(setstate) + STRUCT_FOR_ID(shape) + STRUCT_FOR_ID(show_cmd) + STRUCT_FOR_ID(signed) + STRUCT_FOR_ID(size) + STRUCT_FOR_ID(sizehint) + STRUCT_FOR_ID(skip_file_prefixes) + STRUCT_FOR_ID(sleep) + STRUCT_FOR_ID(sock) + STRUCT_FOR_ID(sort) + STRUCT_FOR_ID(source) + STRUCT_FOR_ID(source_traceback) + STRUCT_FOR_ID(spam) + STRUCT_FOR_ID(src) + STRUCT_FOR_ID(src_dir_fd) + STRUCT_FOR_ID(stacklevel) + STRUCT_FOR_ID(start) + STRUCT_FOR_ID(statement) + STRUCT_FOR_ID(status) + STRUCT_FOR_ID(stderr) + STRUCT_FOR_ID(stdin) + STRUCT_FOR_ID(stdout) + STRUCT_FOR_ID(step) + STRUCT_FOR_ID(steps) + STRUCT_FOR_ID(store_name) + STRUCT_FOR_ID(strategy) + STRUCT_FOR_ID(strftime) + STRUCT_FOR_ID(strict) + STRUCT_FOR_ID(strict_mode) + STRUCT_FOR_ID(string) + STRUCT_FOR_ID(sub_key) + STRUCT_FOR_ID(symmetric_difference_update) + STRUCT_FOR_ID(tabsize) + STRUCT_FOR_ID(tag) + STRUCT_FOR_ID(target) + STRUCT_FOR_ID(target_is_directory) + STRUCT_FOR_ID(task) + STRUCT_FOR_ID(tb_frame) + STRUCT_FOR_ID(tb_lasti) + STRUCT_FOR_ID(tb_lineno) + STRUCT_FOR_ID(tb_next) + STRUCT_FOR_ID(tell) + STRUCT_FOR_ID(template) + STRUCT_FOR_ID(term) + STRUCT_FOR_ID(text) + STRUCT_FOR_ID(threading) + STRUCT_FOR_ID(throw) + STRUCT_FOR_ID(timeout) + STRUCT_FOR_ID(times) + STRUCT_FOR_ID(timetuple) + STRUCT_FOR_ID(top) + STRUCT_FOR_ID(trace_callback) + STRUCT_FOR_ID(traceback) + STRUCT_FOR_ID(trailers) + STRUCT_FOR_ID(translate) + STRUCT_FOR_ID(true) + STRUCT_FOR_ID(truncate) + STRUCT_FOR_ID(twice) + STRUCT_FOR_ID(txt) + STRUCT_FOR_ID(type) + STRUCT_FOR_ID(type_params) + STRUCT_FOR_ID(tz) + STRUCT_FOR_ID(tzinfo) + STRUCT_FOR_ID(tzname) + STRUCT_FOR_ID(uid) + STRUCT_FOR_ID(unlink) + STRUCT_FOR_ID(unraisablehook) + STRUCT_FOR_ID(uri) + STRUCT_FOR_ID(usedforsecurity) + STRUCT_FOR_ID(value) + STRUCT_FOR_ID(values) + STRUCT_FOR_ID(version) + STRUCT_FOR_ID(volume) + STRUCT_FOR_ID(wait_all) + STRUCT_FOR_ID(warn_on_full_buffer) + STRUCT_FOR_ID(warnings) + STRUCT_FOR_ID(warnoptions) + STRUCT_FOR_ID(wbits) + STRUCT_FOR_ID(week) + STRUCT_FOR_ID(weekday) + STRUCT_FOR_ID(which) + STRUCT_FOR_ID(who) + STRUCT_FOR_ID(withdata) + STRUCT_FOR_ID(writable) + STRUCT_FOR_ID(write) + STRUCT_FOR_ID(write_through) + STRUCT_FOR_ID(year) + STRUCT_FOR_ID(zdict) + } identifiers; + struct { + PyASCIIObject _ascii; + uint8_t _data[2]; + } ascii[128]; + struct { + PyCompactUnicodeObject _latin1; + uint8_t _data[2]; + } latin1[128]; +}; +/* End auto-generated code */ + +#undef ID +#undef STR + + +#define _Py_ID(NAME) \ + (_Py_SINGLETON(strings.identifiers._py_ ## NAME._ascii.ob_base)) +#define _Py_STR(NAME) \ + (_Py_SINGLETON(strings.literals._py_ ## NAME._ascii.ob_base)) +#define _Py_LATIN1_CHR(CH) \ + ((CH) < 128 \ + ? (PyObject*)&_Py_SINGLETON(strings).ascii[(CH)] \ + : (PyObject*)&_Py_SINGLETON(strings).latin1[(CH) - 128]) + +/* _Py_DECLARE_STR() should precede all uses of _Py_STR() in a function. + + This is true even if the same string has already been declared + elsewhere, even in the same file. Mismatched duplicates are detected + by Tools/scripts/generate-global-objects.py. + + Pairing _Py_DECLARE_STR() with every use of _Py_STR() makes sure the + string keeps working even if the declaration is removed somewhere + else. It also makes it clear what the actual string is at every + place it is being used. 
*/ +#define _Py_DECLARE_STR(name, str) + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_GLOBAL_STRINGS_H */ diff --git a/Include/internal/pycore_hamt.h b/Include/internal/pycore_hamt.h new file mode 100644 index 0000000000000000000000000000000000000000..d8742c7cb6357833c7d1b5326646ef6375953e46 --- /dev/null +++ b/Include/internal/pycore_hamt.h @@ -0,0 +1,134 @@ +#ifndef Py_INTERNAL_HAMT_H +#define Py_INTERNAL_HAMT_H + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + + +/* +The HAMT tree is shaped by the hashes of its keys. Every group of 5 bits of a hash denotes +the exact position of the key in one level of the tree. Since we're using +32-bit hashes, we can have at most 7 such levels. If there are +two distinct keys with equal hashes, they will have to occupy the same +cell in the 7th level of the tree -- so we'd put them in a "collision" node, +which brings the total possible tree depth to 8. Read more about the actual +layout of the HAMT tree in `hamt.c`. + +This constant is used to define a data structure for storing iteration state. +*/ +#define _Py_HAMT_MAX_TREE_DEPTH 8 + + +extern PyTypeObject _PyHamt_Type; +extern PyTypeObject _PyHamt_ArrayNode_Type; +extern PyTypeObject _PyHamt_BitmapNode_Type; +extern PyTypeObject _PyHamt_CollisionNode_Type; +extern PyTypeObject _PyHamtKeys_Type; +extern PyTypeObject _PyHamtValues_Type; +extern PyTypeObject _PyHamtItems_Type; + + +/* other API */ + +#define PyHamt_Check(o) Py_IS_TYPE((o), &_PyHamt_Type) + + +/* Abstract tree node. */ +typedef struct { + PyObject_HEAD +} PyHamtNode; + + +/* A HAMT immutable mapping collection. */ +typedef struct { + PyObject_HEAD + PyHamtNode *h_root; + PyObject *h_weakreflist; + Py_ssize_t h_count; +} PyHamtObject; + + +typedef struct { + PyObject_VAR_HEAD + uint32_t b_bitmap; + PyObject *b_array[1]; +} PyHamtNode_Bitmap; + + +/* A struct to hold the state of a depth-first traversal of the tree. + + HAMT is an immutable collection. Iterators will hold a strong reference + to it, and every node in the HAMT has strong references to its children. + + So for iterators, we can implement zero allocations and zero reference + inc/dec depth-first iteration. + + - i_nodes: an array of eight pointers to tree nodes + - i_level: the index of the current node in i_nodes + - i_pos: an array of positions within nodes in i_nodes. +*/ +typedef struct { + PyHamtNode *i_nodes[_Py_HAMT_MAX_TREE_DEPTH]; + Py_ssize_t i_pos[_Py_HAMT_MAX_TREE_DEPTH]; + int8_t i_level; +} PyHamtIteratorState; + + +/* Base iterator object. + + Contains the iteration state, a pointer to the HAMT tree, + and a pointer to the 'yield function'. The latter is a simple + function that returns a key/value tuple for the 'Items' iterator, + just a key for the 'Keys' iterator, and a value for the 'Values' + iterator. +*/ +typedef struct { + PyObject_HEAD + PyHamtObject *hi_obj; + PyHamtIteratorState hi_iter; + binaryfunc hi_yield; +} PyHamtIterator; + + +/* Create a new HAMT immutable mapping. */ +PyHamtObject * _PyHamt_New(void); + +/* Return a new collection based on "o", but with an additional + key/val pair. */ +PyHamtObject * _PyHamt_Assoc(PyHamtObject *o, PyObject *key, PyObject *val); + +/* Return a new collection based on "o", but without "key". */ +PyHamtObject * _PyHamt_Without(PyHamtObject *o, PyObject *key); + +/* Find "key" in the "o" collection. + + Return: + - -1: An error occurred. + - 0: "key" wasn't found in "o". + - 1: "key" is in "o"; "*val" is set to its value (a borrowed ref).
+*/ +int _PyHamt_Find(PyHamtObject *o, PyObject *key, PyObject **val); + +/* Check if "v" is equal to "w". + + Return: + - 0: v != w + - 1: v == w + - -1: An error occurred. +*/ +int _PyHamt_Eq(PyHamtObject *v, PyHamtObject *w); + +/* Return the size of "o"; equivalent of "len(o)". */ +Py_ssize_t _PyHamt_Len(PyHamtObject *o); + +/* Return a Keys iterator over "o". */ +PyObject * _PyHamt_NewIterKeys(PyHamtObject *o); + +/* Return a Values iterator over "o". */ +PyObject * _PyHamt_NewIterValues(PyHamtObject *o); + +/* Return an Items iterator over "o". */ +PyObject * _PyHamt_NewIterItems(PyHamtObject *o); + +#endif /* !Py_INTERNAL_HAMT_H */ diff --git a/Include/internal/pycore_hashtable.h b/Include/internal/pycore_hashtable.h new file mode 100644 index 0000000000000000000000000000000000000000..369d49c42bbfccde2e72c2adf5611a5322fb06c6 --- /dev/null +++ b/Include/internal/pycore_hashtable.h @@ -0,0 +1,150 @@ +#ifndef Py_INTERNAL_HASHTABLE_H +#define Py_INTERNAL_HASHTABLE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +/* Singly linked list */ + +typedef struct _Py_slist_item_s { + struct _Py_slist_item_s *next; +} _Py_slist_item_t; + +typedef struct { + _Py_slist_item_t *head; +} _Py_slist_t; + +#define _Py_SLIST_ITEM_NEXT(ITEM) _Py_RVALUE(((_Py_slist_item_t *)(ITEM))->next) + +#define _Py_SLIST_HEAD(SLIST) _Py_RVALUE(((_Py_slist_t *)(SLIST))->head) + + +/* _Py_hashtable: table entry */ + +typedef struct { + /* used by _Py_hashtable_t.buckets to link entries */ + _Py_slist_item_t _Py_slist_item; + + Py_uhash_t key_hash; + void *key; + void *value; +} _Py_hashtable_entry_t; + + +/* _Py_hashtable: prototypes */ + +/* Forward declaration */ +struct _Py_hashtable_t; +typedef struct _Py_hashtable_t _Py_hashtable_t; + +typedef Py_uhash_t (*_Py_hashtable_hash_func) (const void *key); +typedef int (*_Py_hashtable_compare_func) (const void *key1, const void *key2); +typedef void (*_Py_hashtable_destroy_func) (void *key); +typedef _Py_hashtable_entry_t* (*_Py_hashtable_get_entry_func)(_Py_hashtable_t *ht, + const void *key); + +typedef struct { + // Allocate a memory block + void* (*malloc) (size_t size); + + // Release a memory block + void (*free) (void *ptr); +} _Py_hashtable_allocator_t; + + +/* _Py_hashtable: table */ +struct _Py_hashtable_t { + size_t nentries; // Total number of entries in the table + size_t nbuckets; + _Py_slist_t *buckets; + + _Py_hashtable_get_entry_func get_entry_func; + _Py_hashtable_hash_func hash_func; + _Py_hashtable_compare_func compare_func; + _Py_hashtable_destroy_func key_destroy_func; + _Py_hashtable_destroy_func value_destroy_func; + _Py_hashtable_allocator_t alloc; +}; + +// Export _Py_hashtable functions for '_testinternalcapi' shared extension +PyAPI_FUNC(_Py_hashtable_t *) _Py_hashtable_new( + _Py_hashtable_hash_func hash_func, + _Py_hashtable_compare_func compare_func); + +/* Hash a pointer (void*) */ +PyAPI_FUNC(Py_uhash_t) _Py_hashtable_hash_ptr(const void *key); + +/* Comparison using memcmp() */ +PyAPI_FUNC(int) _Py_hashtable_compare_direct( + const void *key1, + const void *key2); + +PyAPI_FUNC(_Py_hashtable_t *) _Py_hashtable_new_full( + _Py_hashtable_hash_func hash_func, + _Py_hashtable_compare_func compare_func, + _Py_hashtable_destroy_func key_destroy_func, + _Py_hashtable_destroy_func value_destroy_func, + _Py_hashtable_allocator_t *allocator); + +PyAPI_FUNC(void) _Py_hashtable_destroy(_Py_hashtable_t *ht); + +PyAPI_FUNC(void) _Py_hashtable_clear(_Py_hashtable_t *ht);
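Taken together with the set/get functions declared just below, this is a complete minimal key/value API. Here is a usage sketch (illustrative only; `make_seen_set` and `mark_seen` are hypothetical helpers, and the code assumes a Py_BUILD_CORE compilation context, since this is an internal header) that keys a table by pointer identity using the default helpers above:

```c
/* Minimal sketch, not part of the diff: a "seen set" keyed by pointer
   identity, built on the default hash/compare helpers declared above. */
#include "pycore_hashtable.h"

static _Py_hashtable_t *
make_seen_set(void)
{
    // Returns NULL on memory allocation failure.
    return _Py_hashtable_new(_Py_hashtable_hash_ptr,
                             _Py_hashtable_compare_direct);
}

static int
mark_seen(_Py_hashtable_t *seen, void *obj)
{
    // _Py_hashtable_get() returns the stored value, or NULL if absent;
    // here the value is always the (non-NULL) pointer itself, so NULL
    // unambiguously means "not present".
    if (_Py_hashtable_get(seen, obj) != NULL) {
        return 1;   // already seen
    }
    // _Py_hashtable_set() requires that the key is not already present;
    // it returns 0 on success and -1 on memory error.
    return _Py_hashtable_set(seen, obj, obj) < 0 ? -1 : 0;
}
```

When the table is no longer needed, `_Py_hashtable_destroy()` frees it and runs the key/value destroy callbacks, if any were registered via `_Py_hashtable_new_full()`.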
+ +typedef int (*_Py_hashtable_foreach_func) (_Py_hashtable_t *ht, + const void *key, const void *value, + void *user_data); + +/* Call func() on each entry of the hashtable. + Iteration stops if func() returns non-zero; in that case, + _Py_hashtable_foreach() returns that value. Otherwise, it returns 0. */ +PyAPI_FUNC(int) _Py_hashtable_foreach( + _Py_hashtable_t *ht, + _Py_hashtable_foreach_func func, + void *user_data); + +PyAPI_FUNC(size_t) _Py_hashtable_size(const _Py_hashtable_t *ht); +PyAPI_FUNC(size_t) _Py_hashtable_len(const _Py_hashtable_t *ht); + +/* Add a new entry to the hash. The key must not be present in the hash table. + Return 0 on success, -1 on memory error. */ +PyAPI_FUNC(int) _Py_hashtable_set( + _Py_hashtable_t *ht, + const void *key, + void *value); + + +/* Get an entry. + Return NULL if the key does not exist. */ +static inline _Py_hashtable_entry_t * +_Py_hashtable_get_entry(_Py_hashtable_t *ht, const void *key) +{ + return ht->get_entry_func(ht, key); +} + + +/* Get value from an entry. + Return NULL if the entry is not found. + + Use _Py_hashtable_get_entry() to distinguish an entry whose value is + NULL from a missing entry. */ +PyAPI_FUNC(void*) _Py_hashtable_get(_Py_hashtable_t *ht, const void *key); + + +/* Remove a key and its associated value without calling key and value destroy + functions. + + Return the removed value if the key was found. + Return NULL if the key was not found. */ +PyAPI_FUNC(void*) _Py_hashtable_steal( + _Py_hashtable_t *ht, + const void *key); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_HASHTABLE_H */ diff --git a/Include/internal/pycore_identifier.h b/Include/internal/pycore_identifier.h new file mode 100644 index 0000000000000000000000000000000000000000..cda28810a48196227bcb54bf5028f18e6bb34e42 --- /dev/null +++ b/Include/internal/pycore_identifier.h @@ -0,0 +1,20 @@ +/* String Literals: _Py_Identifier API */ + +#ifndef Py_INTERNAL_IDENTIFIER_H +#define Py_INTERNAL_IDENTIFIER_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +extern PyObject* _PyType_LookupId(PyTypeObject *, _Py_Identifier *); +extern PyObject* _PyObject_LookupSpecialId(PyObject *, _Py_Identifier *); +extern int _PyObject_SetAttrId(PyObject *, _Py_Identifier *, PyObject *); + +#ifdef __cplusplus +} +#endif +#endif // !Py_INTERNAL_IDENTIFIER_H diff --git a/Include/internal/pycore_import.h b/Include/internal/pycore_import.h new file mode 100644 index 0000000000000000000000000000000000000000..55029abdd31b5a479f16a048d402c44c2d7affca --- /dev/null +++ b/Include/internal/pycore_import.h @@ -0,0 +1,213 @@ +#ifndef Py_LIMITED_API +#ifndef Py_INTERNAL_IMPORT_H +#define Py_INTERNAL_IMPORT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_lock.h" // PyMutex +#include "pycore_hashtable.h" // _Py_hashtable_t + +extern int _PyImport_IsInitialized(PyInterpreterState *); + +// Export for 'pyexpat' shared extension +PyAPI_FUNC(int) _PyImport_SetModule(PyObject *name, PyObject *module); + +extern int _PyImport_SetModuleString(const char *name, PyObject* module); + +extern void _PyImport_AcquireLock(PyInterpreterState *interp); +extern void _PyImport_ReleaseLock(PyInterpreterState *interp); +extern void _PyImport_ReInitLock(PyInterpreterState *interp); + +// This is used exclusively for the sys and builtins modules: +extern int _PyImport_FixupBuiltin( + PyThreadState *tstate, + PyObject *mod, +
const char *name, /* UTF-8 encoded string */ + PyObject *modules + ); + +// Export for many shared extensions, like '_json' +PyAPI_FUNC(PyObject*) _PyImport_GetModuleAttr(PyObject *, PyObject *); + +// Export for many shared extensions, like '_datetime' +PyAPI_FUNC(PyObject*) _PyImport_GetModuleAttrString(const char *, const char *); + + +struct _import_runtime_state { + /* The builtin modules (defined in config.c). */ + struct _inittab *inittab; + /* The most recent value assigned to a PyModuleDef.m_base.m_index. + This is incremented each time PyModuleDef_Init() is called, + which is just about every time an extension module is imported. + See PyInterpreterState.modules_by_index for more info. */ + Py_ssize_t last_module_index; + struct { + /* A lock to guard the cache. */ + PyMutex mutex; + /* The actual cache of (filename, name, PyModuleDef) for modules. + Only legacy (single-phase init) extension modules are added + and only if they support multiple initialization (m_size >= 0) + or are imported in the main interpreter. + This is initialized lazily in fix_up_extension() in import.c. + Modules are added there and looked up in _imp.find_extension(). */ + _Py_hashtable_t *hashtable; + } extensions; + /* Package context -- the full module name for package imports */ + const char * pkgcontext; +}; + +struct _import_state { + /* cached sys.modules dictionary */ + PyObject *modules; + /* This is the list of module objects for all legacy (single-phase init) + extension modules ever loaded in this process (i.e. imported + in this interpreter or in any other). Py_None stands in for + modules that haven't actually been imported in this interpreter. + + A module's index (PyModuleDef.m_base.m_index) is used to look up + the corresponding module object for this interpreter, if any. + (See PyState_FindModule().) When any extension module + is initialized during import, its moduledef gets initialized by + PyModuleDef_Init(), and the first time that happens for each + PyModuleDef, its index gets set to the current value of + a global counter (see _PyRuntimeState.imports.last_module_index). + The entry for that index in this interpreter remains unset until + the module is actually imported here. (Py_None is used as + a placeholder.) Note that multi-phase init modules always get + an index for which there will never be a module set. + + This is initialized lazily in PyState_AddModule(), which is also + where modules get added. */ + PyObject *modules_by_index; + /* importlib module._bootstrap */ + PyObject *importlib; + /* override for config->use_frozen_modules (for tests) + (-1: "off", 1: "on", 0: no override) */ + int override_frozen_modules; + int override_multi_interp_extensions_check; +#ifdef HAVE_DLOPEN + int dlopenflags; +#endif + PyObject *import_func; + /* The global import lock.
*/ + _PyRecursiveMutex lock; + /* diagnostic info in PyImport_ImportModuleLevelObject() */ + struct { + int import_level; + PyTime_t accumulated; + int header; + } find_and_load; +}; + +#ifdef HAVE_DLOPEN +# include <dlfcn.h> // RTLD_NOW, RTLD_LAZY +# if HAVE_DECL_RTLD_NOW +# define _Py_DLOPEN_FLAGS RTLD_NOW +# else +# define _Py_DLOPEN_FLAGS RTLD_LAZY +# endif +# define DLOPENFLAGS_INIT .dlopenflags = _Py_DLOPEN_FLAGS, +#else +# define _Py_DLOPEN_FLAGS 0 +# define DLOPENFLAGS_INIT +#endif + +#define IMPORTS_INIT \ + { \ + DLOPENFLAGS_INIT \ + .find_and_load = { \ + .header = 1, \ + }, \ + } + +extern void _PyImport_ClearCore(PyInterpreterState *interp); + +extern Py_ssize_t _PyImport_GetNextModuleIndex(void); +extern const char * _PyImport_ResolveNameWithPackageContext(const char *name); +extern const char * _PyImport_SwapPackageContext(const char *newcontext); + +extern int _PyImport_GetDLOpenFlags(PyInterpreterState *interp); +extern void _PyImport_SetDLOpenFlags(PyInterpreterState *interp, int new_val); + +extern PyObject * _PyImport_InitModules(PyInterpreterState *interp); +extern PyObject * _PyImport_GetModules(PyInterpreterState *interp); +extern void _PyImport_ClearModules(PyInterpreterState *interp); + +extern void _PyImport_ClearModulesByIndex(PyInterpreterState *interp); + +extern int _PyImport_InitDefaultImportFunc(PyInterpreterState *interp); +extern int _PyImport_IsDefaultImportFunc( + PyInterpreterState *interp, + PyObject *func); + +extern PyObject * _PyImport_GetImportlibLoader( + PyInterpreterState *interp, + const char *loader_name); +extern PyObject * _PyImport_GetImportlibExternalLoader( + PyInterpreterState *interp, + const char *loader_name); +extern PyObject * _PyImport_BlessMyLoader( + PyInterpreterState *interp, + PyObject *module_globals); +extern PyObject * _PyImport_ImportlibModuleRepr( + PyInterpreterState *interp, + PyObject *module); + + +extern PyStatus _PyImport_Init(void); +extern void _PyImport_Fini(void); +extern void _PyImport_Fini2(void); + +extern PyStatus _PyImport_InitCore( + PyThreadState *tstate, + PyObject *sysmod, + int importlib); +extern PyStatus _PyImport_InitExternal(PyThreadState *tstate); +extern void _PyImport_FiniCore(PyInterpreterState *interp); +extern void _PyImport_FiniExternal(PyInterpreterState *interp); + + +extern PyObject* _PyImport_GetBuiltinModuleNames(void); + +struct _module_alias { + const char *name; /* ASCII encoded string */ + const char *orig; /* ASCII encoded string */ +}; + +// Export these 3 symbols for test_ctypes +PyAPI_DATA(const struct _frozen*) _PyImport_FrozenBootstrap; +PyAPI_DATA(const struct _frozen*) _PyImport_FrozenStdlib; +PyAPI_DATA(const struct _frozen*) _PyImport_FrozenTest; + +extern const struct _module_alias * _PyImport_FrozenAliases; + +extern int _PyImport_CheckSubinterpIncompatibleExtensionAllowed( + const char *name); + + +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(int) _PyImport_ClearExtension(PyObject *name, PyObject *filename); + +#ifdef Py_GIL_DISABLED +// Assuming that the GIL is enabled from a call to +// _PyEval_EnableGILTransient(), resolve the transient request depending on the +// state of the module argument: +// - If module is NULL or a PyModuleObject with md_gil == Py_MOD_GIL_NOT_USED, +// call _PyEval_DisableGIL(). +// - Otherwise, call _PyEval_EnableGILPermanent(). If the GIL was not already +// enabled permanently, issue a warning referencing the module's name. +// +// This function may raise an exception.
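+//
+// Hypothetical caller sketch (illustration only; `mod` and `name` are
+// placeholder variables, not part of this header):
+//
+//     if (_PyImport_CheckGILForModule(mod, name) < 0) {
+//         goto error;  // an exception was raised
+//     }
+//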
+extern int _PyImport_CheckGILForModule(PyObject *module, PyObject *module_name); +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_IMPORT_H */ +#endif /* !Py_LIMITED_API */ diff --git a/Include/internal/pycore_importdl.h b/Include/internal/pycore_importdl.h new file mode 100644 index 0000000000000000000000000000000000000000..525a16f6b97274668ac9b4f8b4fd90502c8f10dd --- /dev/null +++ b/Include/internal/pycore_importdl.h @@ -0,0 +1,139 @@ +#ifndef Py_INTERNAL_IMPORTDL_H +#define Py_INTERNAL_IMPORTDL_H + +#include "patchlevel.h" // PY_MAJOR_VERSION + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + + +extern const char *_PyImport_DynLoadFiletab[]; + + +typedef enum ext_module_kind { + _Py_ext_module_kind_UNKNOWN = 0, + _Py_ext_module_kind_SINGLEPHASE = 1, + _Py_ext_module_kind_MULTIPHASE = 2, + _Py_ext_module_kind_INVALID = 3, +} _Py_ext_module_kind; + +typedef enum ext_module_origin { + _Py_ext_module_origin_CORE = 1, + _Py_ext_module_origin_BUILTIN = 2, + _Py_ext_module_origin_DYNAMIC = 3, +} _Py_ext_module_origin; + +/* Input for loading an extension module. */ +struct _Py_ext_module_loader_info { + PyObject *filename; +#ifndef MS_WINDOWS + PyObject *filename_encoded; +#endif + PyObject *name; + PyObject *name_encoded; + /* path is always a borrowed ref of name or filename, + * depending on whether it's builtin or not. */ + PyObject *path; + _Py_ext_module_origin origin; + const char *hook_prefix; + const char *newcontext; +}; +extern void _Py_ext_module_loader_info_clear( + struct _Py_ext_module_loader_info *info); +extern int _Py_ext_module_loader_info_init( + struct _Py_ext_module_loader_info *info, + PyObject *name, + PyObject *filename, + _Py_ext_module_origin origin); +extern int _Py_ext_module_loader_info_init_for_core( + struct _Py_ext_module_loader_info *p_info, + PyObject *name); +extern int _Py_ext_module_loader_info_init_for_builtin( + struct _Py_ext_module_loader_info *p_info, + PyObject *name); +#ifdef HAVE_DYNAMIC_LOADING +extern int _Py_ext_module_loader_info_init_from_spec( + struct _Py_ext_module_loader_info *info, + PyObject *spec); +#endif + +/* The result from running an extension module's init function. */ +struct _Py_ext_module_loader_result { + PyModuleDef *def; + PyObject *module; + _Py_ext_module_kind kind; + struct _Py_ext_module_loader_result_error *err; + struct _Py_ext_module_loader_result_error { + enum _Py_ext_module_loader_result_error_kind { + _Py_ext_module_loader_result_EXCEPTION = 0, + _Py_ext_module_loader_result_ERR_MISSING = 1, + _Py_ext_module_loader_result_ERR_UNREPORTED_EXC = 2, + _Py_ext_module_loader_result_ERR_UNINITIALIZED = 3, + _Py_ext_module_loader_result_ERR_NONASCII_NOT_MULTIPHASE = 4, + _Py_ext_module_loader_result_ERR_NOT_MODULE = 5, + _Py_ext_module_loader_result_ERR_MISSING_DEF = 6, + } kind; + PyObject *exc; + } _err; +}; +extern void _Py_ext_module_loader_result_clear( + struct _Py_ext_module_loader_result *res); +extern void _Py_ext_module_loader_result_apply_error( + struct _Py_ext_module_loader_result *res, + const char *name); + +/* The module init function.
*/ +typedef PyObject *(*PyModInitFunction)(void); +#ifdef HAVE_DYNAMIC_LOADING +extern PyModInitFunction _PyImport_GetModInitFunc( + struct _Py_ext_module_loader_info *info, + FILE *fp); +#endif +extern int _PyImport_RunModInitFunc( + PyModInitFunction p0, + struct _Py_ext_module_loader_info *info, + struct _Py_ext_module_loader_result *p_res); + + +/* Max length of module suffix searched for -- accommodates "module.slb" */ +#define MAXSUFFIXSIZE 12 + +#ifdef MS_WINDOWS +#include <windows.h> +typedef FARPROC dl_funcptr; + +#ifdef _DEBUG +# define PYD_DEBUG_SUFFIX "_d" +#else +# define PYD_DEBUG_SUFFIX "" +#endif + +#ifdef Py_GIL_DISABLED +# define PYD_THREADING_TAG "t" +#else +# define PYD_THREADING_TAG "" +#endif + +#ifdef PYD_PLATFORM_TAG +# define PYD_SOABI "cp" Py_STRINGIFY(PY_MAJOR_VERSION) Py_STRINGIFY(PY_MINOR_VERSION) PYD_THREADING_TAG "-" PYD_PLATFORM_TAG +#else +# define PYD_SOABI "cp" Py_STRINGIFY(PY_MAJOR_VERSION) Py_STRINGIFY(PY_MINOR_VERSION) PYD_THREADING_TAG +#endif + +#define PYD_TAGGED_SUFFIX PYD_DEBUG_SUFFIX "." PYD_SOABI ".pyd" +#define PYD_UNTAGGED_SUFFIX PYD_DEBUG_SUFFIX ".pyd" + +#else +typedef void (*dl_funcptr)(void); +#endif + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_IMPORTDL_H */ diff --git a/Include/internal/pycore_initconfig.h b/Include/internal/pycore_initconfig.h new file mode 100644 index 0000000000000000000000000000000000000000..1c68161341860aa4588e7fdf576737fa01cce99b --- /dev/null +++ b/Include/internal/pycore_initconfig.h @@ -0,0 +1,200 @@ +#ifndef Py_INTERNAL_CORECONFIG_H +#define Py_INTERNAL_CORECONFIG_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +/* Forward declaration */ +struct pyruntimestate; + +/* --- PyStatus ----------------------------------------------- */ + +/* Almost all errors causing Python initialization to fail */ +#ifdef _MSC_VER + /* Visual Studio 2015 doesn't implement C99 __func__ in C */ +# define _PyStatus_GET_FUNC() __FUNCTION__ +#else +# define _PyStatus_GET_FUNC() __func__ +#endif + +#define _PyStatus_OK() \ + (PyStatus){._type = _PyStatus_TYPE_OK} + /* other fields are set to 0 */ +#define _PyStatus_ERR(ERR_MSG) \ + (PyStatus){ \ + ._type = _PyStatus_TYPE_ERROR, \ + .func = _PyStatus_GET_FUNC(), \ + .err_msg = (ERR_MSG)} + /* other fields are set to 0 */ +#define _PyStatus_NO_MEMORY_ERRMSG "memory allocation failed" +#define _PyStatus_NO_MEMORY() _PyStatus_ERR(_PyStatus_NO_MEMORY_ERRMSG) +#define _PyStatus_EXIT(EXITCODE) \ + (PyStatus){ \ + ._type = _PyStatus_TYPE_EXIT, \ + .exitcode = (EXITCODE)} +#define _PyStatus_IS_ERROR(err) \ + ((err)._type == _PyStatus_TYPE_ERROR) +#define _PyStatus_IS_EXIT(err) \ + ((err)._type == _PyStatus_TYPE_EXIT) +#define _PyStatus_EXCEPTION(err) \ + ((err)._type != _PyStatus_TYPE_OK) +#define _PyStatus_UPDATE_FUNC(err) \ + do { (err).func = _PyStatus_GET_FUNC(); } while (0) + +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(void) _PyErr_SetFromPyStatus(PyStatus status); + + +/* --- PyWideStringList ------------------------------------------------ */ + +#define _PyWideStringList_INIT (PyWideStringList){.length = 0, .items = NULL} + +#ifndef NDEBUG +extern int _PyWideStringList_CheckConsistency(const PyWideStringList *list); +#endif +extern void _PyWideStringList_Clear(PyWideStringList *list); +extern int _PyWideStringList_Copy(PyWideStringList *list, + const PyWideStringList *list2); +extern PyStatus _PyWideStringList_Extend(PyWideStringList *list, + const PyWideStringList *list2); +extern
PyObject* _PyWideStringList_AsList(const PyWideStringList *list); + + +/* --- _PyArgv ---------------------------------------------------- */ + +typedef struct _PyArgv { + Py_ssize_t argc; + int use_bytes_argv; + char * const *bytes_argv; + wchar_t * const *wchar_argv; +} _PyArgv; + +extern PyStatus _PyArgv_AsWstrList(const _PyArgv *args, + PyWideStringList *list); + + +/* --- Helper functions ------------------------------------------- */ + +extern int _Py_str_to_int( + const char *str, + int *result); +extern const wchar_t* _Py_get_xoption( + const PyWideStringList *xoptions, + const wchar_t *name); +extern const char* _Py_GetEnv( + int use_environment, + const char *name); +extern void _Py_get_env_flag( + int use_environment, + int *flag, + const char *name); + +/* Py_GetArgcArgv() helper */ +extern void _Py_ClearArgcArgv(void); + + +/* --- _PyPreCmdline ------------------------------------------------- */ + +typedef struct { + PyWideStringList argv; + PyWideStringList xoptions; /* "-X value" option */ + int isolated; /* -I option */ + int use_environment; /* -E option */ + int dev_mode; /* -X dev and PYTHONDEVMODE */ + int warn_default_encoding; /* -X warn_default_encoding and PYTHONWARNDEFAULTENCODING */ +} _PyPreCmdline; + +#define _PyPreCmdline_INIT \ + (_PyPreCmdline){ \ + .use_environment = -1, \ + .isolated = -1, \ + .dev_mode = -1} +/* Note: _PyPreCmdline_INIT sets other fields to 0/NULL */ + +extern void _PyPreCmdline_Clear(_PyPreCmdline *cmdline); +extern PyStatus _PyPreCmdline_SetArgv(_PyPreCmdline *cmdline, + const _PyArgv *args); +extern PyStatus _PyPreCmdline_SetConfig( + const _PyPreCmdline *cmdline, + PyConfig *config); +extern PyStatus _PyPreCmdline_Read(_PyPreCmdline *cmdline, + const PyPreConfig *preconfig); + + +/* --- PyPreConfig ----------------------------------------------- */ + +// Export for '_testembed' program +PyAPI_FUNC(void) _PyPreConfig_InitCompatConfig(PyPreConfig *preconfig); + +extern void _PyPreConfig_InitFromConfig( + PyPreConfig *preconfig, + const PyConfig *config); +extern PyStatus _PyPreConfig_InitFromPreConfig( + PyPreConfig *preconfig, + const PyPreConfig *config2); +extern PyObject* _PyPreConfig_AsDict(const PyPreConfig *preconfig); +extern void _PyPreConfig_GetConfig(PyPreConfig *preconfig, + const PyConfig *config); +extern PyStatus _PyPreConfig_Read(PyPreConfig *preconfig, + const _PyArgv *args); +extern PyStatus _PyPreConfig_Write(const PyPreConfig *preconfig); + + +/* --- PyConfig ---------------------------------------------- */ + +typedef enum { + /* Py_Initialize() API: backward compatibility with Python 3.6 and 3.7 */ + _PyConfig_INIT_COMPAT = 1, + _PyConfig_INIT_PYTHON = 2, + _PyConfig_INIT_ISOLATED = 3 +} _PyConfigInitEnum; + +typedef enum { + /* For now, this means the GIL is enabled. + + gh-116329: This will eventually change to "the GIL is disabled but can + be reenabled by loading an incompatible extension module." */ + _PyConfig_GIL_DEFAULT = -1, + + /* The GIL has been forced off or on, and will not be affected by module loading. 
*/ + _PyConfig_GIL_DISABLE = 0, + _PyConfig_GIL_ENABLE = 1, +} _PyConfigGILEnum; + +// Export for '_testembed' program +PyAPI_FUNC(void) _PyConfig_InitCompatConfig(PyConfig *config); + +extern PyStatus _PyConfig_Copy( + PyConfig *config, + const PyConfig *config2); +extern PyStatus _PyConfig_InitPathConfig( + PyConfig *config, + int compute_path_config); +extern PyStatus _PyConfig_InitImportConfig(PyConfig *config); +extern PyStatus _PyConfig_Read(PyConfig *config, int compute_path_config); +extern PyStatus _PyConfig_Write(const PyConfig *config, + struct pyruntimestate *runtime); +extern PyStatus _PyConfig_SetPyArgv( + PyConfig *config, + const _PyArgv *args); + + +extern void _Py_DumpPathConfig(PyThreadState *tstate); + + +/* --- Function used for testing ---------------------------------- */ + +// Export these functions for '_testinternalcapi' shared extension +PyAPI_FUNC(PyObject*) _PyConfig_AsDict(const PyConfig *config); +PyAPI_FUNC(int) _PyConfig_FromDict(PyConfig *config, PyObject *dict); +PyAPI_FUNC(PyObject*) _Py_Get_Getpath_CodeObject(void); +PyAPI_FUNC(PyObject*) _Py_GetConfigsAsDict(void); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_CORECONFIG_H */ diff --git a/Include/internal/pycore_instruction_sequence.h b/Include/internal/pycore_instruction_sequence.h new file mode 100644 index 0000000000000000000000000000000000000000..d6a79616db71fa0bf65e50be31214155323374c9 --- /dev/null +++ b/Include/internal/pycore_instruction_sequence.h @@ -0,0 +1,73 @@ +#ifndef Py_INTERNAL_INSTRUCTION_SEQUENCE_H +#define Py_INTERNAL_INSTRUCTION_SEQUENCE_H + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_symtable.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +typedef struct { + int h_label; + int h_startdepth; + int h_preserve_lasti; +} _PyExceptHandlerInfo; + +typedef struct { + int i_opcode; + int i_oparg; + _Py_SourceLocation i_loc; + _PyExceptHandlerInfo i_except_handler_info; + + /* Temporary fields, used by the assembler and in instr_sequence_to_cfg */ + int i_target; + int i_offset; +} _PyInstruction; + +typedef struct instruction_sequence { + PyObject_HEAD + _PyInstruction *s_instrs; + int s_allocated; + int s_used; + + int s_next_free_label; /* next free label id */ + + /* Map of a label id to instruction offset (index into s_instrs). + * If s_labelmap is NULL, then each label id is the offset itself. 
+ */ + int *s_labelmap; + int s_labelmap_size; + + /* PyList of instruction sequences of nested functions */ + PyObject *s_nested; +} _PyInstructionSequence; + +typedef struct { + int id; +} _PyJumpTargetLabel; + +PyAPI_FUNC(PyObject*)_PyInstructionSequence_New(void); + +int _PyInstructionSequence_UseLabel(_PyInstructionSequence *seq, int lbl); +int _PyInstructionSequence_Addop(_PyInstructionSequence *seq, + int opcode, int oparg, + _Py_SourceLocation loc); +_PyJumpTargetLabel _PyInstructionSequence_NewLabel(_PyInstructionSequence *seq); +int _PyInstructionSequence_ApplyLabelMap(_PyInstructionSequence *seq); +int _PyInstructionSequence_InsertInstruction(_PyInstructionSequence *seq, int pos, + int opcode, int oparg, _Py_SourceLocation loc); +int _PyInstructionSequence_AddNested(_PyInstructionSequence *seq, _PyInstructionSequence *nested); +void PyInstructionSequence_Fini(_PyInstructionSequence *seq); + +extern PyTypeObject _PyInstructionSequence_Type; +#define _PyInstructionSequence_Check(v) Py_IS_TYPE((v), &_PyInstructionSequence_Type) + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_INSTRUCTION_SEQUENCE_H */ diff --git a/Include/internal/pycore_instruments.h b/Include/internal/pycore_instruments.h new file mode 100644 index 0000000000000000000000000000000000000000..c98e82c8be5546ad82caca437113c98b1372f297 --- /dev/null +++ b/Include/internal/pycore_instruments.h @@ -0,0 +1,75 @@ +#ifndef Py_INTERNAL_INSTRUMENT_H +#define Py_INTERNAL_INSTRUMENT_H + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_frame.h" // _PyInterpreterFrame + +#ifdef __cplusplus +extern "C" { +#endif + +#define PY_MONITORING_TOOL_IDS 8 + +typedef uint32_t _PyMonitoringEventSet; + +/* Tool IDs */ + +/* These are defined in PEP 669 for convenience to avoid clashes */ +#define PY_MONITORING_DEBUGGER_ID 0 +#define PY_MONITORING_COVERAGE_ID 1 +#define PY_MONITORING_PROFILER_ID 2 +#define PY_MONITORING_OPTIMIZER_ID 5 + +/* Internal IDs used to support sys.setprofile() and sys.settrace() */ +#define PY_MONITORING_SYS_PROFILE_ID 6 +#define PY_MONITORING_SYS_TRACE_ID 7 + + +PyObject *_PyMonitoring_RegisterCallback(int tool_id, int event_id, PyObject *obj); + +int _PyMonitoring_SetEvents(int tool_id, _PyMonitoringEventSet events); +int _PyMonitoring_SetLocalEvents(PyCodeObject *code, int tool_id, _PyMonitoringEventSet events); +int _PyMonitoring_GetLocalEvents(PyCodeObject *code, int tool_id, _PyMonitoringEventSet *events); + +extern int +_Py_call_instrumentation(PyThreadState *tstate, int event, + _PyInterpreterFrame *frame, _Py_CODEUNIT *instr); + +extern int +_Py_call_instrumentation_line(PyThreadState *tstate, _PyInterpreterFrame* frame, + _Py_CODEUNIT *instr, _Py_CODEUNIT *prev); + +extern int +_Py_call_instrumentation_instruction( + PyThreadState *tstate, _PyInterpreterFrame* frame, _Py_CODEUNIT *instr); + +_Py_CODEUNIT * +_Py_call_instrumentation_jump( + PyThreadState *tstate, int event, + _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _Py_CODEUNIT *target); + +extern int +_Py_call_instrumentation_arg(PyThreadState *tstate, int event, + _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, PyObject *arg); + +extern int +_Py_call_instrumentation_2args(PyThreadState *tstate, int event, + _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, PyObject *arg0, PyObject *arg1); + +extern void +_Py_call_instrumentation_exc2(PyThreadState *tstate, int event, + _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, PyObject *arg0, PyObject *arg1); + +extern int
+_Py_Instrumentation_GetLine(PyCodeObject *code, int index); + +extern PyObject _PyInstrumentation_MISSING; +extern PyObject _PyInstrumentation_DISABLE; + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_INSTRUMENT_H */ diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h new file mode 100644 index 0000000000000000000000000000000000000000..075d35a37557777a091c9ee13aaf7f25df527941 --- /dev/null +++ b/Include/internal/pycore_interp.h @@ -0,0 +1,423 @@ +#ifndef Py_INTERNAL_INTERP_H +#define Py_INTERNAL_INTERP_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include <stdbool.h> // bool + +#include "pycore_ast_state.h" // struct ast_state +#include "pycore_atexit.h" // struct atexit_state +#include "pycore_ceval_state.h" // struct _ceval_state +#include "pycore_code.h" // struct callable_cache +#include "pycore_codecs.h" // struct codecs_state +#include "pycore_context.h" // struct _Py_context_state +#include "pycore_crossinterp.h" // struct _xidregistry +#include "pycore_dict_state.h" // struct _Py_dict_state +#include "pycore_dtoa.h" // struct _dtoa_state +#include "pycore_exceptions.h" // struct _Py_exc_state +#include "pycore_floatobject.h" // struct _Py_float_state +#include "pycore_function.h" // FUNC_MAX_WATCHERS +#include "pycore_gc.h" // struct _gc_runtime_state +#include "pycore_genobject.h" // struct _Py_async_gen_state +#include "pycore_global_objects.h" // struct _Py_interp_cached_objects +#include "pycore_import.h" // struct _import_state +#include "pycore_instruments.h" // _PY_MONITORING_EVENTS +#include "pycore_list.h" // struct _Py_list_state +#include "pycore_mimalloc.h" // struct _mimalloc_interp_state +#include "pycore_object_state.h" // struct _py_object_state +#include "pycore_optimizer.h" // _PyOptimizerObject +#include "pycore_obmalloc.h" // struct _obmalloc_state +#include "pycore_qsbr.h" // struct _qsbr_state +#include "pycore_tstate.h" // _PyThreadStateImpl +#include "pycore_tuple.h" // struct _Py_tuple_state +#include "pycore_typeobject.h" // struct types_state +#include "pycore_unicodeobject.h" // struct _Py_unicode_state +#include "pycore_warnings.h" // struct _warnings_runtime_state + + +struct _Py_long_state { + int max_str_digits; +}; + +// Support for stop-the-world events. This exists in both the PyRuntime struct +// for global pauses and in each PyInterpreterState for per-interpreter pauses. +struct _stoptheworld_state { + PyMutex mutex; // Serializes stop-the-world attempts. + + // NOTE: The below fields are protected by HEAD_LOCK(runtime), not by the + // above mutex. + bool requested; // Set when a pause is requested. + bool world_stopped; // Set when the world is stopped. + bool is_global; // Set when contained in PyRuntime struct. + + PyEvent stop_event; // Set when thread_countdown reaches zero. + Py_ssize_t thread_countdown; // Number of threads that must pause. + + PyThreadState *requester; // Thread that requested the pause (may be NULL). +}; + +#ifdef Py_GIL_DISABLED +// This should be prime but otherwise the choice is arbitrary. A larger value +// increases concurrency at the expense of memory. +# define NUM_WEAKREF_LIST_LOCKS 127 +#endif + +/* cross-interpreter data registry */ + +/* Tracks some rare events per-interpreter, used by the optimizer to turn on/off + specific optimizations. */ +typedef struct _rare_events { + /* Setting an object's class, obj.__class__ = ...
*/ + uint8_t set_class; + /* Setting the bases of a class, cls.__bases__ = ... */ + uint8_t set_bases; + /* Setting the PEP 523 frame eval function, _PyInterpreterState_SetFrameEvalFunc() */ + uint8_t set_eval_frame_func; + /* Modifying the builtins, __builtins__.__dict__[var] = ... */ + uint8_t builtin_dict; + /* Modifying a function, e.g. func.__defaults__ = ..., etc. */ + uint8_t func_modification; +} _rare_events; + +/* interpreter state */ + +/* PyInterpreterState holds the global state for one of the runtime's + interpreters. Typically the initial (main) interpreter is the only one. + + The PyInterpreterState typedef is in Include/pytypedefs.h. + */ +struct _is { + + /* This struct contains the eval_breaker, + * which is by far the hottest field in this struct + * and should be placed at the beginning. */ + struct _ceval_state ceval; + + PyInterpreterState *next; + + int64_t id; + int64_t id_refcount; + int requires_idref; + PyThread_type_lock id_mutex; + +#define _PyInterpreterState_WHENCE_NOTSET -1 +#define _PyInterpreterState_WHENCE_UNKNOWN 0 +#define _PyInterpreterState_WHENCE_RUNTIME 1 +#define _PyInterpreterState_WHENCE_LEGACY_CAPI 2 +#define _PyInterpreterState_WHENCE_CAPI 3 +#define _PyInterpreterState_WHENCE_XI 4 +#define _PyInterpreterState_WHENCE_STDLIB 5 +#define _PyInterpreterState_WHENCE_MAX 5 + long _whence; + + /* Has been initialized to a safe state. + + In order to be effective, this must be set to 0 during or right + after allocation. */ + int _initialized; + /* Has been fully initialized via pylifecycle.c. */ + int _ready; + int finalizing; + + uintptr_t last_restart_version; + struct pythreads { + uint64_t next_unique_id; + /* The linked list of threads, newest first. */ + PyThreadState *head; + /* The thread currently executing in the __main__ module, if any. */ + PyThreadState *main; + /* Used in Modules/_threadmodule.c. */ + Py_ssize_t count; + /* Support for runtime thread stack size tuning. + A value of 0 means using the platform's default stack size + or the size specified by the THREAD_STACK_SIZE macro. */ + /* Used in Python/thread.c. */ + size_t stacksize; + } threads; + + /* Reference to the _PyRuntime global variable. This field exists + to not have to pass runtime in addition to tstate to a function. + Get runtime from tstate: tstate->interp->runtime. */ + struct pyruntimestate *runtime; + + /* Set by Py_EndInterpreter(). + + Use _PyInterpreterState_GetFinalizing() + and _PyInterpreterState_SetFinalizing() + to access it, don't access it directly. */ + PyThreadState* _finalizing; + /* The ID of the OS thread in which we are finalizing. */ + unsigned long _finalizing_id; + + struct _gc_runtime_state gc; + + /* The following fields are here to avoid allocation during init. + The data is exposed through PyInterpreterState pointer fields. + These fields should not be accessed directly outside of init. + + All other PyInterpreterState pointer fields are populated when + needed and default to NULL. + + For now there are some exceptions to that rule, which require + allocation during init. These will be addressed on a case-by-case + basis. Also see _PyRuntimeState regarding the various mutex fields. + */ + + // Dictionary of the sys module + PyObject *sysdict; + + // Dictionary of the builtins module + PyObject *builtins; + + struct _import_state imports; + + /* The per-interpreter GIL, which might not be used. 
*/ + struct _gil_runtime_state _gil; + + /* ---------- IMPORTANT --------------------------- + The fields above this line are declared as early as + possible to facilitate out-of-process observability + tools. */ + + struct codecs_state codecs; + + PyConfig config; + unsigned long feature_flags; + + PyObject *dict; /* Stores per-interpreter state */ + + PyObject *sysdict_copy; + PyObject *builtins_copy; + // Initialized to _PyEval_EvalFrameDefault(). + _PyFrameEvalFunction eval_frame; + + PyFunction_WatchCallback func_watchers[FUNC_MAX_WATCHERS]; + // One bit is set for each non-NULL entry in func_watchers + uint8_t active_func_watchers; + + Py_ssize_t co_extra_user_count; + freefunc co_extra_freefuncs[MAX_CO_EXTRA_USERS]; + + /* cross-interpreter data and utils */ + struct _xi_state xi; + +#ifdef HAVE_FORK + PyObject *before_forkers; + PyObject *after_forkers_parent; + PyObject *after_forkers_child; +#endif + + struct _warnings_runtime_state warnings; + struct atexit_state atexit; + struct _stoptheworld_state stoptheworld; + struct _qsbr_shared qsbr; + +#if defined(Py_GIL_DISABLED) + struct _mimalloc_interp_state mimalloc; + struct _brc_state brc; // biased reference counting state + PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS]; +#endif + + // Per-interpreter state for the obmalloc allocator. For the main + // interpreter and for all interpreters that don't have their + // own obmalloc state, this points to the static structure in + // obmalloc.c obmalloc_state_main. For other interpreters, it is + // heap allocated by _PyMem_init_obmalloc() and freed when the + // interpreter structure is freed. In the case of a heap allocated + // obmalloc state, it is not safe to hold on to or use memory after + // the interpreter is freed. The obmalloc state corresponding to + // that allocated memory is gone. See free_obmalloc_arenas() for + // more comments. + struct _obmalloc_state *obmalloc; + + PyObject *audit_hooks; + PyType_WatchCallback type_watchers[TYPE_MAX_WATCHERS]; + PyCode_WatchCallback code_watchers[CODE_MAX_WATCHERS]; + // One bit is set for each non-NULL entry in code_watchers + uint8_t active_code_watchers; + + struct _py_object_state object_state; + struct _Py_unicode_state unicode; + struct _Py_long_state long_state; + struct _dtoa_state dtoa; + struct _py_func_state func_state; + struct _py_code_state code_state; + + struct _Py_dict_state dict_state; + struct _Py_exc_state exc_state; + struct _Py_mem_interp_free_queue mem_free_queue; + + struct ast_state ast; + struct types_state types; + struct callable_cache callable_cache; + _PyOptimizerObject *optimizer; + _PyExecutorObject *executor_list_head; + + _rare_events rare_events; + PyDict_WatchCallback builtins_dict_watcher; + + _Py_GlobalMonitors monitors; + bool sys_profile_initialized; + bool sys_trace_initialized; + Py_ssize_t sys_profiling_threads; /* Count of threads with c_profilefunc set */ + Py_ssize_t sys_tracing_threads; /* Count of threads with c_tracefunc set */ + PyObject *monitoring_callables[PY_MONITORING_TOOL_IDS][_PY_MONITORING_EVENTS]; + PyObject *monitoring_tool_names[PY_MONITORING_TOOL_IDS]; + + struct _Py_interp_cached_objects cached_objects; + struct _Py_interp_static_objects static_objects; + + /* the initial PyInterpreterState.threads.head */ + _PyThreadStateImpl _initial_thread; + Py_ssize_t _interactive_src_count; + // In 3.14+ this is interp->threads.preallocated. 
+ _PyThreadStateImpl *threads_preallocated; +}; + + +/* other API */ + +extern void _PyInterpreterState_Clear(PyThreadState *tstate); + + +static inline PyThreadState* +_PyInterpreterState_GetFinalizing(PyInterpreterState *interp) { + return (PyThreadState*)_Py_atomic_load_ptr_relaxed(&interp->_finalizing); +} + +static inline unsigned long +_PyInterpreterState_GetFinalizingID(PyInterpreterState *interp) { + return _Py_atomic_load_ulong_relaxed(&interp->_finalizing_id); +} + +static inline void +_PyInterpreterState_SetFinalizing(PyInterpreterState *interp, PyThreadState *tstate) { + _Py_atomic_store_ptr_relaxed(&interp->_finalizing, tstate); + if (tstate == NULL) { + _Py_atomic_store_ulong_relaxed(&interp->_finalizing_id, 0); + } + else { + // XXX Re-enable this assert once gh-109860 is fixed. + //assert(tstate->thread_id == PyThread_get_thread_ident()); + _Py_atomic_store_ulong_relaxed(&interp->_finalizing_id, + tstate->thread_id); + } +} + + + +// Exports for the _testinternalcapi module. +PyAPI_FUNC(int64_t) _PyInterpreterState_ObjectToID(PyObject *); +PyAPI_FUNC(PyInterpreterState *) _PyInterpreterState_LookUpID(int64_t); +PyAPI_FUNC(PyInterpreterState *) _PyInterpreterState_LookUpIDObject(PyObject *); +PyAPI_FUNC(int) _PyInterpreterState_IDInitref(PyInterpreterState *); +PyAPI_FUNC(int) _PyInterpreterState_IDIncref(PyInterpreterState *); +PyAPI_FUNC(void) _PyInterpreterState_IDDecref(PyInterpreterState *); + +PyAPI_FUNC(int) _PyInterpreterState_IsReady(PyInterpreterState *interp); + +PyAPI_FUNC(long) _PyInterpreterState_GetWhence(PyInterpreterState *interp); +extern void _PyInterpreterState_SetWhence( + PyInterpreterState *interp, + long whence); + +extern const PyConfig* _PyInterpreterState_GetConfig(PyInterpreterState *interp); + +// Get a copy of the current interpreter configuration. +// +// Return 0 on success. Raise an exception and return -1 on error. +// +// The caller must initialize 'config', using PyConfig_InitPythonConfig() +// for example. +// +// Python must be preinitialized to call this method. +// The caller must hold the GIL. +// +// Once done with the configuration, PyConfig_Clear() must be called to clear +// it. +// +// Export for '_testinternalcapi' shared extension. +PyAPI_FUNC(int) _PyInterpreterState_GetConfigCopy( + struct PyConfig *config); + +// Set the configuration of the current interpreter. +// +// This function should be called during or just after the Python +// initialization. +// +// Update the sys module with the new configuration. If the sys module was +// modified directly after the Python initialization, these changes are lost. +// +// Some configuration, like faulthandler or warnoptions, can be updated in the +// configuration, but don't reconfigure Python (don't enable/disable +// faulthandler and don't reconfigure warnings filters). +// +// Return 0 on success. Raise an exception and return -1 on error. +// +// The configuration should come from _PyInterpreterState_GetConfigCopy(). +// +// Export for '_testinternalcapi' shared extension. +PyAPI_FUNC(int) _PyInterpreterState_SetConfig( + const struct PyConfig *config); + + +/* +Runtime Feature Flags + +Each flag indicates whether or not a specific runtime feature +is available in a given context. For example, forking the process +might not be allowed in the current interpreter (i.e. os.fork() would fail). +*/ + +/* Set if the interpreter shares obmalloc runtime state + with the main interpreter.
*/ +#define Py_RTFLAGS_USE_MAIN_OBMALLOC (1UL << 5) + +/* Set if import should check a module for subinterpreter support. */ +#define Py_RTFLAGS_MULTI_INTERP_EXTENSIONS (1UL << 8) + +/* Set if threads are allowed. */ +#define Py_RTFLAGS_THREADS (1UL << 10) + +/* Set if daemon threads are allowed. */ +#define Py_RTFLAGS_DAEMON_THREADS (1UL << 11) + +/* Set if os.fork() is allowed. */ +#define Py_RTFLAGS_FORK (1UL << 15) + +/* Set if os.exec*() is allowed. */ +#define Py_RTFLAGS_EXEC (1UL << 16) + +extern int _PyInterpreterState_HasFeature(PyInterpreterState *interp, + unsigned long feature); + +PyAPI_FUNC(PyStatus) _PyInterpreterState_New( + PyThreadState *tstate, + PyInterpreterState **pinterp); + + +#define RARE_EVENT_INTERP_INC(interp, name) \ + do { \ + /* saturating add */ \ + int val = FT_ATOMIC_LOAD_UINT8_RELAXED(interp->rare_events.name); \ + if (val < UINT8_MAX) { \ + FT_ATOMIC_STORE_UINT8(interp->rare_events.name, val + 1); \ + } \ + RARE_EVENT_STAT_INC(name); \ + } while (0); \ + +#define RARE_EVENT_INC(name) \ + do { \ + PyInterpreterState *interp = PyInterpreterState_Get(); \ + RARE_EVENT_INTERP_INC(interp, name); \ + } while (0); \ + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_INTERP_H */ diff --git a/Include/internal/pycore_intrinsics.h b/Include/internal/pycore_intrinsics.h new file mode 100644 index 0000000000000000000000000000000000000000..39c2a30f6e979de3e4521393c53b198a07a585ab --- /dev/null +++ b/Include/internal/pycore_intrinsics.h @@ -0,0 +1,51 @@ +#ifndef Py_INTERNAL_INTRINSIC_H +#define Py_INTERNAL_INTRINSIC_H + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +/* Unary Functions: */ +#define INTRINSIC_1_INVALID 0 +#define INTRINSIC_PRINT 1 +#define INTRINSIC_IMPORT_STAR 2 +#define INTRINSIC_STOPITERATION_ERROR 3 +#define INTRINSIC_ASYNC_GEN_WRAP 4 +#define INTRINSIC_UNARY_POSITIVE 5 +#define INTRINSIC_LIST_TO_TUPLE 6 +#define INTRINSIC_TYPEVAR 7 +#define INTRINSIC_PARAMSPEC 8 +#define INTRINSIC_TYPEVARTUPLE 9 +#define INTRINSIC_SUBSCRIPT_GENERIC 10 +#define INTRINSIC_TYPEALIAS 11 + +#define MAX_INTRINSIC_1 11 + + +/* Binary Functions: */ +#define INTRINSIC_2_INVALID 0 +#define INTRINSIC_PREP_RERAISE_STAR 1 +#define INTRINSIC_TYPEVAR_WITH_BOUND 2 +#define INTRINSIC_TYPEVAR_WITH_CONSTRAINTS 3 +#define INTRINSIC_SET_FUNCTION_TYPE_PARAMS 4 +#define INTRINSIC_SET_TYPEPARAM_DEFAULT 5 + +#define MAX_INTRINSIC_2 5 + +typedef PyObject *(*intrinsic_func1)(PyThreadState* tstate, PyObject *value); +typedef PyObject *(*intrinsic_func2)(PyThreadState* tstate, PyObject *value1, PyObject *value2); + +typedef struct { + intrinsic_func1 func; + const char *name; +} intrinsic_func1_info; + +typedef struct { + intrinsic_func2 func; + const char *name; +} intrinsic_func2_info; + +PyAPI_DATA(const intrinsic_func1_info) _PyIntrinsics_UnaryFunctions[]; +PyAPI_DATA(const intrinsic_func2_info) _PyIntrinsics_BinaryFunctions[]; + +#endif // !Py_INTERNAL_INTRINSIC_H diff --git a/Include/internal/pycore_jit.h b/Include/internal/pycore_jit.h new file mode 100644 index 0000000000000000000000000000000000000000..17bd23f0752be20f2600a434fd33b36062969839 --- /dev/null +++ b/Include/internal/pycore_jit.h @@ -0,0 +1,25 @@ +#ifndef Py_INTERNAL_JIT_H +#define Py_INTERNAL_JIT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#ifdef _Py_JIT + +typedef _Py_CODEUNIT *(*jit_func)(_PyInterpreterFrame *frame, PyObject **stack_pointer, PyThreadState *tstate); + +int 
_PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size_t length);
+void _PyJIT_Free(_PyExecutorObject *executor);
+
+#endif // _Py_JIT
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // !Py_INTERNAL_JIT_H
diff --git a/Include/internal/pycore_list.h b/Include/internal/pycore_list.h
new file mode 100644
index 0000000000000000000000000000000000000000..73695d10e0c37216b44e5e209e537a4a4dcaa8b3
--- /dev/null
+++ b/Include/internal/pycore_list.h
@@ -0,0 +1,66 @@
+#ifndef Py_INTERNAL_LIST_H
+#define Py_INTERNAL_LIST_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_freelist.h"   // _PyFreeListState
+
+PyAPI_FUNC(PyObject*) _PyList_Extend(PyListObject *, PyObject *);
+extern void _PyList_DebugMallocStats(FILE *out);
+
+#define _PyList_ITEMS(op) _Py_RVALUE(_PyList_CAST(op)->ob_item)
+
+PyAPI_FUNC(int)
+_PyList_AppendTakeRefListResize(PyListObject *self, PyObject *newitem);
+
+// In the free-threaded build, the caller must hold a lock on `self` for this
+// operation to be thread-safe.
+static inline int
+_PyList_AppendTakeRef(PyListObject *self, PyObject *newitem)
+{
+    assert(self != NULL && newitem != NULL);
+    assert(PyList_Check(self));
+    Py_ssize_t len = Py_SIZE(self);
+    Py_ssize_t allocated = self->allocated;
+    assert((size_t)len + 1 < PY_SSIZE_T_MAX);
+    if (allocated > len) {
+#ifdef Py_GIL_DISABLED
+        _Py_atomic_store_ptr_release(&self->ob_item[len], newitem);
+#else
+        PyList_SET_ITEM(self, len, newitem);
+#endif
+        Py_SET_SIZE(self, len + 1);
+        return 0;
+    }
+    return _PyList_AppendTakeRefListResize(self, newitem);
+}
+
+// Repeat the bytes of a buffer in place
+static inline void
+_Py_memory_repeat(char* dest, Py_ssize_t len_dest, Py_ssize_t len_src)
+{
+    assert(len_src > 0);
+    Py_ssize_t copied = len_src;
+    while (copied < len_dest) {
+        Py_ssize_t bytes_to_copy = Py_MIN(copied, len_dest - copied);
+        memcpy(dest + copied, dest, bytes_to_copy);
+        copied += bytes_to_copy;
+    }
+}
+
+typedef struct {
+    PyObject_HEAD
+    Py_ssize_t it_index;
+    PyListObject *it_seq;  /* Set to NULL when iterator is exhausted */
+} _PyListIterObject;
+
+PyAPI_FUNC(PyObject *)_PyList_FromArraySteal(PyObject *const *src, Py_ssize_t n);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_LIST_H */
diff --git a/Include/internal/pycore_llist.h b/Include/internal/pycore_llist.h
new file mode 100644
index 0000000000000000000000000000000000000000..f629902fda9ff181f434b7b8e8df39fd71aa50ef
--- /dev/null
+++ b/Include/internal/pycore_llist.h
@@ -0,0 +1,106 @@
+// A doubly-linked list that can be embedded in a struct.
+//
+// Usage:
+//     struct llist_node head = LLIST_INIT(head);
+//     typedef struct {
+//         ...
+//         struct llist_node node;
+//         ...
+//     } MyObj;
+//
+//     llist_insert_tail(&head, &obj->node);
+//     llist_remove(&obj->node);
+//
+//     struct llist_node *node;
+//     llist_for_each(node, &head) {
+//         MyObj *obj = llist_data(node, MyObj, node);
+//         ...
+//     }
+//
+
+#ifndef Py_INTERNAL_LLIST_H
+#define Py_INTERNAL_LLIST_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "Py_BUILD_CORE must be defined to include this header"
+#endif
+
+struct llist_node {
+    struct llist_node *next;
+    struct llist_node *prev;
+};
+
+// Get the struct containing a node.
+#define llist_data(node, type, member) (_Py_CONTAINER_OF(node, type, member))
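+
+// A hedged sketch of removal during iteration (illustrative only; MyObj is
+// the hypothetical struct from the usage example above, and the macros are
+// defined just below):
+//
+//     struct llist_node *node;
+//     llist_for_each_safe(node, &head) {
+//         MyObj *obj = llist_data(node, MyObj, node);
+//         if (obj->done) {
+//             llist_remove(node);   // safe: the next node was saved
+//         }
+//     }
+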
+// Iterate over a list.
+#define llist_for_each(node, head) \
+    for (node = (head)->next; node != (head); node = node->next)
+
+// Iterate over a list, but allow removal of the current node.
+#define llist_for_each_safe(node, head) \
+    for (struct llist_node *_next = (node = (head)->next, node->next); \
+         node != (head); node = _next, _next = node->next)
+
+#define LLIST_INIT(head) { &head, &head }
+
+static inline void
+llist_init(struct llist_node *head)
+{
+    head->next = head;
+    head->prev = head;
+}
+
+// Returns 1 if the list is empty, 0 otherwise.
+static inline int
+llist_empty(struct llist_node *head)
+{
+    return head->next == head;
+}
+
+// Appends to the tail of the list.
+static inline void
+llist_insert_tail(struct llist_node *head, struct llist_node *node)
+{
+    node->prev = head->prev;
+    node->next = head;
+    head->prev->next = node;
+    head->prev = node;
+}
+
+// Remove a node from the list.
+static inline void
+llist_remove(struct llist_node *node)
+{
+    struct llist_node *prev = node->prev;
+    struct llist_node *next = node->next;
+    prev->next = next;
+    next->prev = prev;
+    node->prev = NULL;
+    node->next = NULL;
+}
+
+// Append all nodes from head2 onto head1. head2 is left empty.
+static inline void
+llist_concat(struct llist_node *head1, struct llist_node *head2)
+{
+    if (!llist_empty(head2)) {
+        head1->prev->next = head2->next;
+        head2->next->prev = head1->prev;
+
+        head1->prev = head2->prev;
+        head2->prev->next = head1;
+        llist_init(head2);
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_LLIST_H */
diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a18bb7644725fcb3cf8391af11a6e8e7e919b36
--- /dev/null
+++ b/Include/internal/pycore_lock.h
@@ -0,0 +1,241 @@
+// Lightweight locks and other synchronization mechanisms.
+//
+// These implementations are based on WebKit's WTF::Lock. See
+// https://webkit.org/blog/6161/locking-in-webkit/ for a description of the
+// design.
+#ifndef Py_INTERNAL_LOCK_H
+#define Py_INTERNAL_LOCK_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+// _Py_UNLOCKED is defined as 0 and _Py_LOCKED as 1 in Include/cpython/lock.h
+#define _Py_HAS_PARKED 2
+#define _Py_ONCE_INITIALIZED 4
+
+static inline int
+PyMutex_LockFast(uint8_t *lock_bits)
+{
+    uint8_t expected = _Py_UNLOCKED;
+    return _Py_atomic_compare_exchange_uint8(lock_bits, &expected, _Py_LOCKED);
+}
+
+// Checks if the mutex is currently locked.
+static inline int
+PyMutex_IsLocked(PyMutex *m)
+{
+    return (_Py_atomic_load_uint8(&m->_bits) & _Py_LOCKED) != 0;
+}
+
+// Re-initializes the mutex after a fork to the unlocked state.
+static inline void
+_PyMutex_at_fork_reinit(PyMutex *m)
+{
+    memset(m, 0, sizeof(*m));
+}
+
+typedef enum _PyLockFlags {
+    // Do not detach/release the GIL when waiting on the lock.
+    _Py_LOCK_DONT_DETACH = 0,
+
+    // Detach/release the GIL while waiting on the lock.
+    _PY_LOCK_DETACH = 1,
+
+    // Handle signals if interrupted while waiting on the lock.
+    _PY_LOCK_HANDLE_SIGNALS = 2,
+} _PyLockFlags;
+
+// Lock a mutex with an optional timeout and additional options. See
+// _PyLockFlags for details.
+extern PyLockStatus
+_PyMutex_LockTimed(PyMutex *m, PyTime_t timeout_ns, _PyLockFlags flags);
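+
+// A hedged usage sketch (hypothetical call site; assumes a PyMutex `m`
+// reachable by this thread and the flags declared above):
+//
+//     PyLockStatus st = _PyMutex_LockTimed(&m, -1 /* wait forever */,
+//                                          _PY_LOCK_DETACH);
+//     if (st == PY_LOCK_ACQUIRED) {
+//         ...critical section...
+//         PyMutex_Unlock(&m);
+//     }
+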
+// Lock a mutex with additional options. See _PyLockFlags for details.
+static inline void
+PyMutex_LockFlags(PyMutex *m, _PyLockFlags flags)
+{
+    uint8_t expected = _Py_UNLOCKED;
+    if (!_Py_atomic_compare_exchange_uint8(&m->_bits, &expected, _Py_LOCKED)) {
+        _PyMutex_LockTimed(m, -1, flags);
+    }
+}
+
+// Unlock a mutex, returns 0 if the mutex is not locked (used for improved
+// error messages).
+extern int _PyMutex_TryUnlock(PyMutex *m);
+
+
+// PyEvent is a one-time event notification
+typedef struct {
+    uint8_t v;
+} PyEvent;
+
+// Check if the event is set without blocking. Returns 1 if the event is set or
+// 0 otherwise.
+PyAPI_FUNC(int) _PyEvent_IsSet(PyEvent *evt);
+
+// Set the event and notify any waiting threads.
+// Export for '_testinternalcapi' shared extension
+PyAPI_FUNC(void) _PyEvent_Notify(PyEvent *evt);
+
+// Wait for the event to be set. If the event is already set, then this returns
+// immediately.
+PyAPI_FUNC(void) PyEvent_Wait(PyEvent *evt);
+
+// Wait for the event to be set, or until the timeout expires. If the event is
+// already set, then this returns immediately. Returns 1 if the event was set,
+// and 0 if the timeout expired or the thread was interrupted. If `detach` is
+// true, then the thread will detach/release the GIL while waiting.
+PyAPI_FUNC(int)
+PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns, int detach);
+
+// _PyRawMutex implements a word-sized mutex that does not depend on the
+// parking lot API, and therefore can be used in the parking lot
+// implementation.
+//
+// The mutex uses a packed representation: the least significant bit is used to
+// indicate whether the mutex is locked or not. The remaining bits are either
+// zero or a pointer to a `struct raw_mutex_entry` (see lock.c).
+typedef struct {
+    uintptr_t v;
+} _PyRawMutex;
+
+// Slow paths for lock/unlock
+extern void _PyRawMutex_LockSlow(_PyRawMutex *m);
+extern void _PyRawMutex_UnlockSlow(_PyRawMutex *m);
+
+static inline void
+_PyRawMutex_Lock(_PyRawMutex *m)
+{
+    uintptr_t unlocked = _Py_UNLOCKED;
+    if (_Py_atomic_compare_exchange_uintptr(&m->v, &unlocked, _Py_LOCKED)) {
+        return;
+    }
+    _PyRawMutex_LockSlow(m);
+}
+
+static inline void
+_PyRawMutex_Unlock(_PyRawMutex *m)
+{
+    uintptr_t locked = _Py_LOCKED;
+    if (_Py_atomic_compare_exchange_uintptr(&m->v, &locked, _Py_UNLOCKED)) {
+        return;
+    }
+    _PyRawMutex_UnlockSlow(m);
+}
+
+// Type signature for one-time initialization functions. The function should
+// return 0 on success and -1 on failure.
+typedef int _Py_once_fn_t(void *arg);
+
+// (private) slow path for one time initialization
+PyAPI_FUNC(int)
+_PyOnceFlag_CallOnceSlow(_PyOnceFlag *flag, _Py_once_fn_t *fn, void *arg);
+
+// Calls `fn` once using `flag`. The `arg` is passed to the call to `fn`.
+//
+// Returns 0 on success and -1 on failure.
+//
+// If `fn` returns 0 (success), then subsequent calls immediately return 0.
+// If `fn` returns -1 (failure), then subsequent calls will retry the call.
+static inline int
+_PyOnceFlag_CallOnce(_PyOnceFlag *flag, _Py_once_fn_t *fn, void *arg)
+{
+    if (_Py_atomic_load_uint8(&flag->v) == _Py_ONCE_INITIALIZED) {
+        return 0;
+    }
+    return _PyOnceFlag_CallOnceSlow(flag, fn, arg);
+}
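+
+// A hedged sketch of one-time initialization (hypothetical names; only the
+// _PyOnceFlag API above is real):
+//
+//     static _PyOnceFlag table_once;              // zero-initialized
+//
+//     static int init_tables(void *arg) { ...; return 0; }
+//
+//     if (_PyOnceFlag_CallOnce(&table_once, init_tables, NULL) < 0) {
+//         return -1;   // init failed; a later call will retry
+//     }
+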
+// A recursive mutex. The mutex should be zero-initialized.
+typedef struct {
+    PyMutex mutex;
+    unsigned long long thread;  // i.e., PyThread_get_thread_ident_ex()
+    size_t level;
+} _PyRecursiveMutex;
+
+PyAPI_FUNC(int) _PyRecursiveMutex_IsLockedByCurrentThread(_PyRecursiveMutex *m);
+PyAPI_FUNC(void) _PyRecursiveMutex_Lock(_PyRecursiveMutex *m);
+PyAPI_FUNC(void) _PyRecursiveMutex_Unlock(_PyRecursiveMutex *m);
+
+
+// A readers-writer (RW) lock. The lock supports multiple concurrent readers or
+// a single writer. The lock is write-preferring: if a writer is waiting while
+// the lock is read-locked, then new readers will be blocked. This avoids
+// starvation of writers.
+//
+// In C++, the equivalent synchronization primitive is std::shared_mutex
+// with shared ("read") and exclusive ("write") locking.
+//
+// The two least significant bits are used to indicate if the lock is
+// write-locked and if there are parked threads (either readers or writers)
+// waiting to acquire the lock. The remaining bits are used to indicate the
+// number of readers holding the lock.
+//
+// 0b000..00000: unlocked
+// 0bnnn..nnn00: nnn..nnn readers holding the lock
+// 0bnnn..nnn10: nnn..nnn readers holding the lock and a writer is waiting
+// 0b00000..010: unlocked with awoken writer about to acquire lock
+// 0b00000..001: write-locked
+// 0b00000..011: write-locked and readers or other writers are waiting
+//
+// Note that reader_count must be zero if the lock is held by a writer, and
+// vice versa. The lock can only be held by readers or a writer, but not both.
+//
+// The design is optimized for simplicity of the implementation. The lock is
+// not fair: if fairness is desired, use an additional PyMutex to serialize
+// writers. The lock is also not reentrant.
+typedef struct {
+    uintptr_t bits;
+} _PyRWMutex;
+
+// Read lock (i.e., shared lock)
+PyAPI_FUNC(void) _PyRWMutex_RLock(_PyRWMutex *rwmutex);
+PyAPI_FUNC(void) _PyRWMutex_RUnlock(_PyRWMutex *rwmutex);
+
+// Write lock (i.e., exclusive lock)
+PyAPI_FUNC(void) _PyRWMutex_Lock(_PyRWMutex *rwmutex);
+PyAPI_FUNC(void) _PyRWMutex_Unlock(_PyRWMutex *rwmutex);
+
+// Similar to the Linux seqlock: https://en.wikipedia.org/wiki/Seqlock
+// We use a sequence number to lock the writer: an even sequence means we're
+// unlocked, an odd sequence means we're locked. Readers will read the sequence
+// before attempting to read the underlying data and then read the sequence
+// number again after reading the data. If the sequence has not changed the
+// data is valid.
+//
+// This differs a little in that we use a CAS on the sequence as the lock,
+// instead of a separate spin lock. The writer can also detect that the
+// underlying data has not changed, abandon the write, and restore the
+// previous sequence.
+typedef struct {
+    uint32_t sequence;
+} _PySeqLock;
+
+// Lock the sequence lock for the writer
+PyAPI_FUNC(void) _PySeqLock_LockWrite(_PySeqLock *seqlock);
+
+// Unlock the sequence lock and move to the next sequence number.
+PyAPI_FUNC(void) _PySeqLock_UnlockWrite(_PySeqLock *seqlock);
+
+// Abandon the current update indicating that no mutations have occurred
+// and restore the previous sequence value.
+PyAPI_FUNC(void) _PySeqLock_AbandonWrite(_PySeqLock *seqlock);
+
+// Begin a read operation and return the current sequence number.
+PyAPI_FUNC(uint32_t) _PySeqLock_BeginRead(_PySeqLock *seqlock);
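+
+// A hedged reader-side sketch (illustrative only; `lock` guards a
+// hypothetical `value` that the writer updates between
+// _PySeqLock_LockWrite() and _PySeqLock_UnlockWrite()):
+//
+//     uint32_t seq;
+//     int snapshot;
+//     do {
+//         seq = _PySeqLock_BeginRead(&lock);
+//         snapshot = value;                  // read the protected data
+//     } while (!_PySeqLock_EndRead(&lock, seq));
+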
+// End the read operation and confirm that the sequence number has not changed.
+// Returns 1 if the read was successful or 0 if the read should be retried.
+PyAPI_FUNC(int) _PySeqLock_EndRead(_PySeqLock *seqlock, uint32_t previous);
+
+// Check if the lock was held during a fork and clear the lock. Returns 1
+// if the lock was held and any associated data should be cleared.
+PyAPI_FUNC(int) _PySeqLock_AfterFork(_PySeqLock *seqlock);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_LOCK_H */
diff --git a/Include/internal/pycore_long.h b/Include/internal/pycore_long.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff7d9afc03a4f24ea4602560b49c68fc7b2273cb
--- /dev/null
+++ b/Include/internal/pycore_long.h
@@ -0,0 +1,310 @@
+#ifndef Py_INTERNAL_LONG_H
+#define Py_INTERNAL_LONG_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_bytesobject.h"   // _PyBytesWriter
+#include "pycore_global_objects.h"// _PY_NSMALLNEGINTS
+#include "pycore_runtime.h"       // _PyRuntime
+
+/*
+ * Default int base conversion size limitation: Denial of Service prevention.
+ *
+ * Chosen such that this isn't wildly slow on modern hardware and so that
+ * everyone's existing deployed numpy test suite passes before
+ * https://github.com/numpy/numpy/issues/22098 is widely available.
+ *
+ * $ python -m timeit -s 's = "1"*4300' 'int(s)'
+ * 2000 loops, best of 5: 125 usec per loop
+ * $ python -m timeit -s 's = "1"*4300; v = int(s)' 'str(v)'
+ * 1000 loops, best of 5: 311 usec per loop
+ * (zen2 cloud VM)
+ *
+ * 4300 decimal digits fits a ~14284 bit number.
+ */
+#define _PY_LONG_DEFAULT_MAX_STR_DIGITS 4300
+/*
+ * Threshold for max digits check. For performance reasons int() and
+ * int.__str__() don't check values that are smaller than this
+ * threshold. Acts as a guaranteed minimum size limit for bignums that
+ * applications can expect from CPython.
+ *
+ * % python -m timeit -s 's = "1"*640; v = int(s)' 'str(int(s))'
+ * 20000 loops, best of 5: 12 usec per loop
+ *
+ * "640 digits should be enough for anyone." - gps
+ * fits a ~2126 bit decimal number.
+ */
+#define _PY_LONG_MAX_STR_DIGITS_THRESHOLD 640
+
+#if ((_PY_LONG_DEFAULT_MAX_STR_DIGITS != 0) && \
+   (_PY_LONG_DEFAULT_MAX_STR_DIGITS < _PY_LONG_MAX_STR_DIGITS_THRESHOLD))
+# error "_PY_LONG_DEFAULT_MAX_STR_DIGITS smaller than threshold."
+#endif
+
+/* runtime lifecycle */
+
+extern PyStatus _PyLong_InitTypes(PyInterpreterState *);
+extern void _PyLong_FiniTypes(PyInterpreterState *interp);
+
+
+/* other API */
+
+#define _PyLong_SMALL_INTS _Py_SINGLETON(small_ints)
+
+// _PyLong_GetZero() and _PyLong_GetOne() must always be available
+// _PyLong_FromUnsignedChar must always be available
+#if _PY_NSMALLPOSINTS < 257
+# error "_PY_NSMALLPOSINTS must be greater than or equal to 257"
+#endif
+
+// Return a reference to the immortal zero singleton.
+// The function cannot return NULL.
+static inline PyObject* _PyLong_GetZero(void)
+{ return (PyObject *)&_PyLong_SMALL_INTS[_PY_NSMALLNEGINTS]; }
+
+// Return a reference to the immortal one singleton.
+// The function cannot return NULL.
+static inline PyObject* _PyLong_GetOne(void)
+{ return (PyObject *)&_PyLong_SMALL_INTS[_PY_NSMALLNEGINTS+1]; }
+
+static inline PyObject* _PyLong_FromUnsignedChar(unsigned char i)
+{
+    return (PyObject *)&_PyLong_SMALL_INTS[_PY_NSMALLNEGINTS+i];
+}
+
+// _PyLong_Frexp returns a double x and an exponent e such that the
+// true value is approximately equal to x * 2**e. e is >= 0. x is
+// 0.0 if and only if the input is 0 (in which case, e and x are both
+// zeroes); otherwise, 0.5 <= abs(x) < 1.0. On overflow, which is
+// possible if the number of bits doesn't fit into a Py_ssize_t, sets
+// OverflowError and returns -1.0 for x, 0 for e.
+//
+// Export for 'math' shared extension
+PyAPI_FUNC(double) _PyLong_Frexp(PyLongObject *a, Py_ssize_t *e);
+
+extern PyObject* _PyLong_FromBytes(const char *, Py_ssize_t, int);
+
+// _PyLong_DivmodNear. Given integers a and b, compute the nearest
+// integer q to the exact quotient a / b, rounding to the nearest even integer
+// in the case of a tie. Return (q, r), where r = a - q*b. The remainder r
+// will satisfy abs(r) <= abs(b)/2, with equality possible only if q is
+// even.
+//
+// Export for '_datetime' shared extension.
+PyAPI_FUNC(PyObject*) _PyLong_DivmodNear(PyObject *, PyObject *);
+
+// _PyLong_Format: Convert the long to a string object with given base,
+// appending a base prefix of 0[box] if base is 2, 8 or 16.
+// Export for '_tkinter' shared extension.
+PyAPI_FUNC(PyObject*) _PyLong_Format(PyObject *obj, int base);
+
+// Export for 'math' shared extension
+PyAPI_FUNC(PyObject*) _PyLong_Rshift(PyObject *, size_t);
+
+// Export for 'math' shared extension
+PyAPI_FUNC(PyObject*) _PyLong_Lshift(PyObject *, size_t);
+
+PyAPI_FUNC(PyObject*) _PyLong_Add(PyLongObject *left, PyLongObject *right);
+PyAPI_FUNC(PyObject*) _PyLong_Multiply(PyLongObject *left, PyLongObject *right);
+PyAPI_FUNC(PyObject*) _PyLong_Subtract(PyLongObject *left, PyLongObject *right);
+
+// Export for 'binascii' shared extension.
+PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
+
+/* Format the object based on the format_spec, as defined in PEP 3101
+   (Advanced String Formatting). */
+extern int _PyLong_FormatAdvancedWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    PyObject *format_spec,
+    Py_ssize_t start,
+    Py_ssize_t end);
+
+extern int _PyLong_FormatWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    int base,
+    int alternate);
+
+extern char* _PyLong_FormatBytesWriter(
+    _PyBytesWriter *writer,
+    char *str,
+    PyObject *obj,
+    int base,
+    int alternate);
+
+// Argument converters used by Argument Clinic
+
+// Export for 'select' shared extension (Argument Clinic code)
+PyAPI_FUNC(int) _PyLong_UnsignedShort_Converter(PyObject *, void *);
+
+// Export for '_testclinic' shared extension (Argument Clinic code)
+PyAPI_FUNC(int) _PyLong_UnsignedInt_Converter(PyObject *, void *);
+
+// Export for '_blake2' shared extension (Argument Clinic code)
+PyAPI_FUNC(int) _PyLong_UnsignedLong_Converter(PyObject *, void *);
+
+// Export for '_blake2' shared extension (Argument Clinic code)
+PyAPI_FUNC(int) _PyLong_UnsignedLongLong_Converter(PyObject *, void *);
+
+// Export for '_testclinic' shared extension (Argument Clinic code)
+PyAPI_FUNC(int) _PyLong_Size_t_Converter(PyObject *, void *);
+
+/* Long value tag bits:
+ * 0-1: Sign bits value = (1-sign), i.e. negative=2, positive=0, zero=1.
+ * 2: Reserved for immortality bit
+ * 3+ Unsigned digit count
+ */
+#define SIGN_MASK 3
+#define SIGN_ZERO 1
+#define SIGN_NEGATIVE 2
+#define NON_SIZE_BITS 3
+
+/* The functions _PyLong_IsCompact and _PyLong_CompactValue are defined
+ * in Include/cpython/longobject.h, since they need to be inline.
+ *
+ * "Compact" values have at least one bit to spare,
+ * so that addition and subtraction can be performed on the values
+ * without risk of overflow.
+ *
+ * The inline functions need tag bits.
+ * For readability, rather than do `#define SIGN_MASK _PyLong_SIGN_MASK`
+ * we define them to the numbers in both places and then assert that
+ * they're the same.
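+ *
+ * A hedged worked example (illustrative only): a one-digit negative int
+ * such as -7 carries
+ *
+ *     lv_tag == (1 << NON_SIZE_BITS) | SIGN_NEGATIVE
+ *
+ * so `lv_tag >> NON_SIZE_BITS` recovers the digit count (1) and
+ * `1 - (lv_tag & SIGN_MASK)` recovers the sign (-1).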
+ */
+#if SIGN_MASK != _PyLong_SIGN_MASK
+# error "SIGN_MASK does not match _PyLong_SIGN_MASK"
+#endif
+#if NON_SIZE_BITS != _PyLong_NON_SIZE_BITS
+# error "NON_SIZE_BITS does not match _PyLong_NON_SIZE_BITS"
+#endif
+
+/* All *compact* values are guaranteed to fit into
+ * a Py_ssize_t with at least one bit to spare.
+ * In other words, for 64 bit machines, compact
+ * will be signed 63 (or fewer) bit values
+ */
+
+/* Return 1 if the argument is a non-negative compact int */
+static inline int
+_PyLong_IsNonNegativeCompact(const PyLongObject* op) {
+    assert(PyLong_Check(op));
+    return op->long_value.lv_tag <= (1 << NON_SIZE_BITS);
+}
+
+
+static inline int
+_PyLong_BothAreCompact(const PyLongObject* a, const PyLongObject* b) {
+    assert(PyLong_Check(a));
+    assert(PyLong_Check(b));
+    return (a->long_value.lv_tag | b->long_value.lv_tag) < (2 << NON_SIZE_BITS);
+}
+
+static inline bool
+_PyLong_IsZero(const PyLongObject *op)
+{
+    return (op->long_value.lv_tag & SIGN_MASK) == SIGN_ZERO;
+}
+
+static inline bool
+_PyLong_IsNegative(const PyLongObject *op)
+{
+    return (op->long_value.lv_tag & SIGN_MASK) == SIGN_NEGATIVE;
+}
+
+static inline bool
+_PyLong_IsPositive(const PyLongObject *op)
+{
+    return (op->long_value.lv_tag & SIGN_MASK) == 0;
+}
+
+static inline Py_ssize_t
+_PyLong_DigitCount(const PyLongObject *op)
+{
+    assert(PyLong_Check(op));
+    return op->long_value.lv_tag >> NON_SIZE_BITS;
+}
+
+/* Equivalent to _PyLong_DigitCount(op) * _PyLong_NonCompactSign(op) */
+static inline Py_ssize_t
+_PyLong_SignedDigitCount(const PyLongObject *op)
+{
+    assert(PyLong_Check(op));
+    Py_ssize_t sign = 1 - (op->long_value.lv_tag & SIGN_MASK);
+    return sign * (Py_ssize_t)(op->long_value.lv_tag >> NON_SIZE_BITS);
+}
+
+static inline int
+_PyLong_CompactSign(const PyLongObject *op)
+{
+    assert(PyLong_Check(op));
+    assert(_PyLong_IsCompact(op));
+    return 1 - (op->long_value.lv_tag & SIGN_MASK);
+}
+
+static inline int
+_PyLong_NonCompactSign(const PyLongObject *op)
+{
+    assert(PyLong_Check(op));
+    assert(!_PyLong_IsCompact(op));
+    return 1 - (op->long_value.lv_tag & SIGN_MASK);
+}
+
+/* Do a and b have the same sign? */
+static inline int
+_PyLong_SameSign(const PyLongObject *a, const PyLongObject *b)
+{
+    return (a->long_value.lv_tag & SIGN_MASK) == (b->long_value.lv_tag & SIGN_MASK);
+}
+
+#define TAG_FROM_SIGN_AND_SIZE(sign, size) ((1 - (sign)) | ((size) << NON_SIZE_BITS))
+
+static inline void
+_PyLong_SetSignAndDigitCount(PyLongObject *op, int sign, Py_ssize_t size)
+{
+    assert(size >= 0);
+    assert(-1 <= sign && sign <= 1);
+    assert(sign != 0 || size == 0);
+    op->long_value.lv_tag = TAG_FROM_SIGN_AND_SIZE(sign, (size_t)size);
+}
+
+static inline void
+_PyLong_SetDigitCount(PyLongObject *op, Py_ssize_t size)
+{
+    assert(size >= 0);
+    op->long_value.lv_tag = (((size_t)size) << NON_SIZE_BITS) | (op->long_value.lv_tag & SIGN_MASK);
+}
+
+#define NON_SIZE_MASK ~((1 << NON_SIZE_BITS) - 1)
+
+static inline void
+_PyLong_FlipSign(PyLongObject *op) {
+    unsigned int flipped_sign = 2 - (op->long_value.lv_tag & SIGN_MASK);
+    op->long_value.lv_tag &= NON_SIZE_MASK;
+    op->long_value.lv_tag |= flipped_sign;
+}
+
+#define _PyLong_DIGIT_INIT(val) \
+    { \
+        .ob_base = _PyObject_HEAD_INIT(&PyLong_Type), \
+        .long_value = { \
+            .lv_tag = TAG_FROM_SIGN_AND_SIZE( \
+                (val) == 0 ? 0 : ((val) < 0 ? -1 : 1), \
+                (val) == 0 ? 0 : 1), \
+            { ((val) >= 0 ? (val) : -(val)) }, \
+        } \
+    }
+
+#define _PyLong_FALSE_TAG TAG_FROM_SIGN_AND_SIZE(0, 0)
+#define _PyLong_TRUE_TAG TAG_FROM_SIGN_AND_SIZE(1, 1)
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_LONG_H */
diff --git a/Include/internal/pycore_memoryobject.h b/Include/internal/pycore_memoryobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..62e204fcbf65339d252793e0ffd1e54b9f860ce3
--- /dev/null
+++ b/Include/internal/pycore_memoryobject.h
@@ -0,0 +1,20 @@
+#ifndef Py_INTERNAL_MEMORYOBJECT_H
+#define Py_INTERNAL_MEMORYOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+extern PyTypeObject _PyManagedBuffer_Type;
+
+PyObject *
+_PyMemoryView_FromBufferProc(PyObject *v, int flags,
+                             getbufferproc bufferproc);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_MEMORYOBJECT_H */
diff --git a/Include/internal/pycore_mimalloc.h b/Include/internal/pycore_mimalloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..d870d01beb702c060b7b81ff325a2900bc4d9012
--- /dev/null
+++ b/Include/internal/pycore_mimalloc.h
@@ -0,0 +1,69 @@
+#ifndef Py_INTERNAL_MIMALLOC_H
+#define Py_INTERNAL_MIMALLOC_H
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#if defined(MIMALLOC_H) || defined(MIMALLOC_TYPES_H)
+# error "pycore_mimalloc.h must be included before mimalloc.h"
+#endif
+
+typedef enum {
+    _Py_MIMALLOC_HEAP_MEM = 0,       // PyMem_Malloc() and friends
+    _Py_MIMALLOC_HEAP_OBJECT = 1,    // non-GC objects
+    _Py_MIMALLOC_HEAP_GC = 2,        // GC objects without pre-header
+    _Py_MIMALLOC_HEAP_GC_PRE = 3,    // GC objects with pre-header
+    _Py_MIMALLOC_HEAP_COUNT
+} _Py_mimalloc_heap_id;
+
+#include "pycore_pymem.h"
+
+#ifdef WITH_MIMALLOC
+# ifdef Py_GIL_DISABLED
+#  define MI_PRIM_THREAD_ID _Py_ThreadId
+# endif
+# define MI_DEBUG_UNINIT PYMEM_CLEANBYTE
+# define MI_DEBUG_FREED PYMEM_DEADBYTE
+# define MI_DEBUG_PADDING PYMEM_FORBIDDENBYTE
+#ifdef Py_DEBUG
+# define MI_DEBUG 2
+#else
+# define MI_DEBUG 0
+#endif
+
+#ifdef _Py_THREAD_SANITIZER
+# define MI_TSAN 1
+#endif
+
+#ifdef __cplusplus
+extern "C++" {
+#endif
+
+#include "mimalloc/mimalloc.h"
+#include "mimalloc/mimalloc/types.h"
+#include "mimalloc/mimalloc/internal.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#ifdef Py_GIL_DISABLED
+struct _mimalloc_interp_state {
+    // When exiting, threads place any segments with live blocks in this
+    // shared pool for other threads to claim and reuse.
+ mi_abandoned_pool_t abandoned_pool; +}; + +struct _mimalloc_thread_state { + mi_heap_t *current_object_heap; + mi_heap_t heaps[_Py_MIMALLOC_HEAP_COUNT]; + mi_tld_t tld; + int initialized; + struct llist_node page_list; +}; +#endif + +#endif // Py_INTERNAL_MIMALLOC_H diff --git a/Include/internal/pycore_modsupport.h b/Include/internal/pycore_modsupport.h new file mode 100644 index 0000000000000000000000000000000000000000..11fde814875938f23768d517a5434032502b9298 --- /dev/null +++ b/Include/internal/pycore_modsupport.h @@ -0,0 +1,107 @@ +#ifndef Py_INTERNAL_MODSUPPORT_H +#define Py_INTERNAL_MODSUPPORT_H + +#include "pycore_lock.h" // _PyOnceFlag + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + + +extern int _PyArg_NoKwnames(const char *funcname, PyObject *kwnames); +#define _PyArg_NoKwnames(funcname, kwnames) \ + ((kwnames) == NULL || _PyArg_NoKwnames((funcname), (kwnames))) + +// Export for '_bz2' shared extension +PyAPI_FUNC(int) _PyArg_NoPositional(const char *funcname, PyObject *args); +#define _PyArg_NoPositional(funcname, args) \ + ((args) == NULL || _PyArg_NoPositional((funcname), (args))) + +// Export for '_asyncio' shared extension +PyAPI_FUNC(int) _PyArg_NoKeywords(const char *funcname, PyObject *kwargs); +#define _PyArg_NoKeywords(funcname, kwargs) \ + ((kwargs) == NULL || _PyArg_NoKeywords((funcname), (kwargs))) + +// Export for 'zlib' shared extension +PyAPI_FUNC(int) _PyArg_CheckPositional(const char *, Py_ssize_t, + Py_ssize_t, Py_ssize_t); +#define _Py_ANY_VARARGS(n) ((n) == PY_SSIZE_T_MAX) +#define _PyArg_CheckPositional(funcname, nargs, min, max) \ + ((!_Py_ANY_VARARGS(max) && (min) <= (nargs) && (nargs) <= (max)) \ + || _PyArg_CheckPositional((funcname), (nargs), (min), (max))) + +extern PyObject ** _Py_VaBuildStack( + PyObject **small_stack, + Py_ssize_t small_stack_len, + const char *format, + va_list va, + Py_ssize_t *p_nargs); + +extern PyObject* _PyModule_CreateInitialized(PyModuleDef*, int apiver); + +// Export for '_curses' shared extension +PyAPI_FUNC(int) _PyArg_ParseStack( + PyObject *const *args, + Py_ssize_t nargs, + const char *format, + ...); + +extern int _PyArg_UnpackStack( + PyObject *const *args, + Py_ssize_t nargs, + const char *name, + Py_ssize_t min, + Py_ssize_t max, + ...); + +// Export for '_heapq' shared extension +PyAPI_FUNC(void) _PyArg_BadArgument( + const char *fname, + const char *displayname, + const char *expected, + PyObject *arg); + +// --- _PyArg_Parser API --------------------------------------------------- + +// Export for '_dbm' shared extension +PyAPI_FUNC(int) _PyArg_ParseStackAndKeywords( + PyObject *const *args, + Py_ssize_t nargs, + PyObject *kwnames, + struct _PyArg_Parser *, + ...); + +// Export for 'math' shared extension +PyAPI_FUNC(PyObject * const *) _PyArg_UnpackKeywords( + PyObject *const *args, + Py_ssize_t nargs, + PyObject *kwargs, + PyObject *kwnames, + struct _PyArg_Parser *parser, + int minpos, + int maxpos, + int minkw, + PyObject **buf); +#define _PyArg_UnpackKeywords(args, nargs, kwargs, kwnames, parser, minpos, maxpos, minkw, buf) \ + (((minkw) == 0 && (kwargs) == NULL && (kwnames) == NULL && \ + (minpos) <= (nargs) && (nargs) <= (maxpos) && (args) != NULL) ? 
(args) : \ + _PyArg_UnpackKeywords((args), (nargs), (kwargs), (kwnames), (parser), \ + (minpos), (maxpos), (minkw), (buf))) + +// Export for '_testclinic' shared extension +PyAPI_FUNC(PyObject * const *) _PyArg_UnpackKeywordsWithVararg( + PyObject *const *args, Py_ssize_t nargs, + PyObject *kwargs, PyObject *kwnames, + struct _PyArg_Parser *parser, + int minpos, int maxpos, int minkw, + int vararg, PyObject **buf); + +#ifdef __cplusplus +} +#endif +#endif // !Py_INTERNAL_MODSUPPORT_H + diff --git a/Include/internal/pycore_moduleobject.h b/Include/internal/pycore_moduleobject.h new file mode 100644 index 0000000000000000000000000000000000000000..dacc00dba54495136067bf39955862af54087dab --- /dev/null +++ b/Include/internal/pycore_moduleobject.h @@ -0,0 +1,56 @@ +#ifndef Py_INTERNAL_MODULEOBJECT_H +#define Py_INTERNAL_MODULEOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +extern void _PyModule_Clear(PyObject *); +extern void _PyModule_ClearDict(PyObject *); +extern int _PyModuleSpec_IsInitializing(PyObject *); +extern int _PyModuleSpec_GetFileOrigin(PyObject *, PyObject **); +extern int _PyModule_IsPossiblyShadowing(PyObject *); + +extern int _PyModule_IsExtension(PyObject *obj); + +typedef struct { + PyObject_HEAD + PyObject *md_dict; + PyModuleDef *md_def; + void *md_state; + PyObject *md_weaklist; + // for logging purposes after md_dict is cleared + PyObject *md_name; +#ifdef Py_GIL_DISABLED + void *md_gil; +#endif +} PyModuleObject; + +static inline PyModuleDef* _PyModule_GetDef(PyObject *mod) { + assert(PyModule_Check(mod)); + return ((PyModuleObject *)mod)->md_def; +} + +static inline void* _PyModule_GetState(PyObject* mod) { + assert(PyModule_Check(mod)); + return ((PyModuleObject *)mod)->md_state; +} + +static inline PyObject* _PyModule_GetDict(PyObject *mod) { + assert(PyModule_Check(mod)); + PyObject *dict = ((PyModuleObject *)mod) -> md_dict; + // _PyModule_GetDict(mod) must not be used after calling module_clear(mod) + assert(dict != NULL); + return dict; // borrowed reference +} + +PyObject* _Py_module_getattro_impl(PyModuleObject *m, PyObject *name, int suppress); +PyObject* _Py_module_getattro(PyModuleObject *m, PyObject *name); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_MODULEOBJECT_H */ diff --git a/Include/internal/pycore_namespace.h b/Include/internal/pycore_namespace.h new file mode 100644 index 0000000000000000000000000000000000000000..f165cf15319a599b1aab816a6a7bca128159a6a9 --- /dev/null +++ b/Include/internal/pycore_namespace.h @@ -0,0 +1,21 @@ +// Simple namespace object interface + +#ifndef Py_INTERNAL_NAMESPACE_H +#define Py_INTERNAL_NAMESPACE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +extern PyTypeObject _PyNamespace_Type; + +// Export for '_testmultiphase' shared extension +PyAPI_FUNC(PyObject*) _PyNamespace_New(PyObject *kwds); + +#ifdef __cplusplus +} +#endif +#endif // !Py_INTERNAL_NAMESPACE_H diff --git a/Include/internal/pycore_object.h b/Include/internal/pycore_object.h new file mode 100644 index 0000000000000000000000000000000000000000..5877d43f4fd5a4d9d5b2c989c9ef9f8245805e12 --- /dev/null +++ b/Include/internal/pycore_object.h @@ -0,0 +1,867 @@ +#ifndef Py_INTERNAL_OBJECT_H +#define Py_INTERNAL_OBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include +#include "pycore_gc.h" 
          // _PyObject_GC_IS_TRACKED()
+#include "pycore_emscripten_trampoline.h"  // _PyCFunction_TrampolineCall()
+#include "pycore_interp.h"                 // PyInterpreterState.gc
+#include "pycore_pyatomic_ft_wrappers.h"   // FT_ATOMIC_STORE_PTR_RELAXED
+#include "pycore_pystate.h"                // _PyInterpreterState_GET()
+
+
+#define _Py_IMMORTAL_REFCNT_LOOSE ((_Py_IMMORTAL_REFCNT >> 1) + 1)
+
+// gh-121528, gh-118997: Similar to _Py_IsImmortal() but more lenient when
+// comparing the reference count, to stay compatible with C extensions built
+// with the stable ABI 3.11 or older. Such extensions implement INCREF/DECREF
+// as refcnt++ and refcnt-- without taking into account immortal objects. For
+// example, the reference count of an immortal object can change from
+// _Py_IMMORTAL_REFCNT to _Py_IMMORTAL_REFCNT+1 (INCREF) or
+// _Py_IMMORTAL_REFCNT-1 (DECREF).
+//
+// This function should only be used in assertions. Otherwise, _Py_IsImmortal()
+// must be used instead.
+static inline int _Py_IsImmortalLoose(PyObject *op)
+{
+#if defined(Py_GIL_DISABLED)
+    return _Py_IsImmortal(op);
+#else
+    return (op->ob_refcnt >= _Py_IMMORTAL_REFCNT_LOOSE);
+#endif
+}
+#define _Py_IsImmortalLoose(op) _Py_IsImmortalLoose(_PyObject_CAST(op))
+
+
+/* Check if an object is consistent. For example, ensure that the reference
+   counter is greater than or equal to 1, and ensure that ob_type is not NULL.
+
+   Call _PyObject_AssertFailed() if the object is inconsistent.
+
+   If check_content is zero, only check header fields: reduce the overhead.
+
+   The function always returns 1. The return value is just here to be able to
+   write:
+
+   assert(_PyObject_CheckConsistency(obj, 1)); */
+extern int _PyObject_CheckConsistency(PyObject *op, int check_content);
+
+extern void _PyDebugAllocatorStats(FILE *out, const char *block_name,
+                                   int num_blocks, size_t sizeof_block);
+
+extern void _PyObject_DebugTypeStats(FILE *out);
+
+#ifdef Py_TRACE_REFS
+// Forget a reference registered by _Py_NewReference(). Function called by
+// _Py_Dealloc().
+//
+// On a free list, the function can be used before modifying an object to
+// remove the object from traced objects. Then _Py_NewReference() or
+// _Py_NewReferenceNoTotal() should be called again on the object to trace
+// it again.
+extern void _Py_ForgetReference(PyObject *);
+#endif
+
+// Export for shared _testinternalcapi extension
+PyAPI_FUNC(int) _PyObject_IsFreed(PyObject *);
+
+/* We need to maintain an internal copy of Py{Var}Object_HEAD_INIT to avoid
+   designated initializer conflicts in C++20. If we use the definition in
+   object.h, we will be mixing designated and non-designated initializers in
+   pycore objects, which is forbidden in C++20. However, if we then use
+   designated initializers in object.h, then extensions without designated
+   initializers break. Furthermore, we can't use designated initializers in
+   extensions since these are not supported pre-C++20. Thus, keeping an
+   internal copy here is the most
Thus, keeping an internal copy here is the most + backwards compatible solution */ +#if defined(Py_GIL_DISABLED) +#define _PyObject_HEAD_INIT(type) \ + { \ + .ob_ref_local = _Py_IMMORTAL_REFCNT_LOCAL, \ + .ob_type = (type) \ + } +#else +#define _PyObject_HEAD_INIT(type) \ + { \ + .ob_refcnt = _Py_IMMORTAL_REFCNT, \ + .ob_type = (type) \ + } +#endif +#define _PyVarObject_HEAD_INIT(type, size) \ + { \ + .ob_base = _PyObject_HEAD_INIT(type), \ + .ob_size = size \ + } + +PyAPI_FUNC(void) _Py_NO_RETURN _Py_FatalRefcountErrorFunc( + const char *func, + const char *message); + +#define _Py_FatalRefcountError(message) \ + _Py_FatalRefcountErrorFunc(__func__, (message)) + +#define _PyReftracerTrack(obj, operation) \ + do { \ + struct _reftracer_runtime_state *tracer = &_PyRuntime.ref_tracer; \ + if (tracer->tracer_func != NULL) { \ + void *data = tracer->tracer_data; \ + tracer->tracer_func((obj), (operation), data); \ + } \ + } while(0) + +#ifdef Py_REF_DEBUG +/* The symbol is only exposed in the API for the sake of extensions + built against the pre-3.12 stable ABI. */ +PyAPI_DATA(Py_ssize_t) _Py_RefTotal; + +extern void _Py_AddRefTotal(PyThreadState *, Py_ssize_t); +extern void _Py_IncRefTotal(PyThreadState *); +extern void _Py_DecRefTotal(PyThreadState *); + +# define _Py_DEC_REFTOTAL(interp) \ + interp->object_state.reftotal-- +#endif + +// Increment reference count by n +static inline void _Py_RefcntAdd(PyObject* op, Py_ssize_t n) +{ + if (_Py_IsImmortal(op)) { + return; + } +#ifdef Py_REF_DEBUG + _Py_AddRefTotal(_PyThreadState_GET(), n); +#endif +#if !defined(Py_GIL_DISABLED) + op->ob_refcnt += n; +#else + if (_Py_IsOwnedByCurrentThread(op)) { + uint32_t local = op->ob_ref_local; + Py_ssize_t refcnt = (Py_ssize_t)local + n; +# if PY_SSIZE_T_MAX > UINT32_MAX + if (refcnt > (Py_ssize_t)UINT32_MAX) { + // Make the object immortal if the 32-bit local reference count + // would overflow. + refcnt = _Py_IMMORTAL_REFCNT_LOCAL; + } +# endif + _Py_atomic_store_uint32_relaxed(&op->ob_ref_local, (uint32_t)refcnt); + } + else { + _Py_atomic_add_ssize(&op->ob_ref_shared, (n << _Py_REF_SHARED_SHIFT)); + } +#endif + // Although the ref count was increased by `n` (which may be greater than 1) + // it is only a single increment (i.e. addition) operation, so only 1 refcnt + // increment operation is counted. + _Py_INCREF_STAT_INC(); +} +#define _Py_RefcntAdd(op, n) _Py_RefcntAdd(_PyObject_CAST(op), n) + +extern void _Py_SetImmortal(PyObject *op); +extern void _Py_SetImmortalUntracked(PyObject *op); + +// Checks if an object has a single, unique reference. If the caller holds a +// unique reference, it may be able to safely modify the object in-place. +static inline int +_PyObject_IsUniquelyReferenced(PyObject *ob) +{ +#if !defined(Py_GIL_DISABLED) + return Py_REFCNT(ob) == 1; +#else + // NOTE: the entire ob_ref_shared field must be zero, including flags, to + // ensure that other threads cannot concurrently create new references to + // this object. + return (_Py_IsOwnedByCurrentThread(ob) && + _Py_atomic_load_uint32_relaxed(&ob->ob_ref_local) == 1 && + _Py_atomic_load_ssize_relaxed(&ob->ob_ref_shared) == 0); +#endif +} + +// Makes an immortal object mortal again with the specified refcnt. Should only +// be used during runtime finalization. 
+static inline void _Py_SetMortal(PyObject *op, Py_ssize_t refcnt) +{ + if (op) { + assert(_Py_IsImmortalLoose(op)); +#ifdef Py_GIL_DISABLED + op->ob_tid = _Py_UNOWNED_TID; + op->ob_ref_local = 0; + op->ob_ref_shared = _Py_REF_SHARED(refcnt, _Py_REF_MERGED); +#else + op->ob_refcnt = refcnt; +#endif + } +} + +/* _Py_ClearImmortal() should only be used during runtime finalization. */ +static inline void _Py_ClearImmortal(PyObject *op) +{ + if (op) { + _Py_SetMortal(op, 1); + Py_DECREF(op); + } +} +#define _Py_ClearImmortal(op) \ + do { \ + _Py_ClearImmortal(_PyObject_CAST(op)); \ + op = NULL; \ + } while (0) + +// Mark an object as supporting deferred reference counting. This is a no-op +// in the default (with GIL) build. Objects that use deferred reference +// counting should be tracked by the GC so that they are eventually collected. +extern void _PyObject_SetDeferredRefcount(PyObject *op); + +static inline int +_PyObject_HasDeferredRefcount(PyObject *op) +{ +#ifdef Py_GIL_DISABLED + return _PyObject_HAS_GC_BITS(op, _PyGC_BITS_DEFERRED); +#else + return 0; +#endif +} + +#if !defined(Py_GIL_DISABLED) +static inline void +_Py_DECREF_SPECIALIZED(PyObject *op, const destructor destruct) +{ + if (_Py_IsImmortal(op)) { + return; + } + _Py_DECREF_STAT_INC(); +#ifdef Py_REF_DEBUG + _Py_DEC_REFTOTAL(PyInterpreterState_Get()); +#endif + if (--op->ob_refcnt != 0) { + assert(op->ob_refcnt > 0); + } + else { +#ifdef Py_TRACE_REFS + _Py_ForgetReference(op); +#endif + _PyReftracerTrack(op, PyRefTracer_DESTROY); + destruct(op); + } +} + +static inline void +_Py_DECREF_NO_DEALLOC(PyObject *op) +{ + if (_Py_IsImmortal(op)) { + return; + } + _Py_DECREF_STAT_INC(); +#ifdef Py_REF_DEBUG + _Py_DEC_REFTOTAL(PyInterpreterState_Get()); +#endif + op->ob_refcnt--; +#ifdef Py_DEBUG + if (op->ob_refcnt <= 0) { + _Py_FatalRefcountError("Expected a positive remaining refcount"); + } +#endif +} + +#else +// TODO: implement Py_DECREF specializations for Py_GIL_DISABLED build +static inline void +_Py_DECREF_SPECIALIZED(PyObject *op, const destructor destruct) +{ + Py_DECREF(op); +} + +static inline void +_Py_DECREF_NO_DEALLOC(PyObject *op) +{ + Py_DECREF(op); +} + +static inline int +_Py_REF_IS_MERGED(Py_ssize_t ob_ref_shared) +{ + return (ob_ref_shared & _Py_REF_SHARED_FLAG_MASK) == _Py_REF_MERGED; +} + +static inline int +_Py_REF_IS_QUEUED(Py_ssize_t ob_ref_shared) +{ + return (ob_ref_shared & _Py_REF_SHARED_FLAG_MASK) == _Py_REF_QUEUED; +} + +// Merge the local and shared reference count fields and add `extra` to the +// refcount when merging. +Py_ssize_t _Py_ExplicitMergeRefcount(PyObject *op, Py_ssize_t extra); +#endif // !defined(Py_GIL_DISABLED) + +#ifdef Py_REF_DEBUG +# undef _Py_DEC_REFTOTAL +#endif + + +extern int _PyType_CheckConsistency(PyTypeObject *type); +extern int _PyDict_CheckConsistency(PyObject *mp, int check_content); + +/* Update the Python traceback of an object. This function must be called + when a memory block is reused from a free list. + + Internal function called by _Py_NewReference(). 
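+
+   A hedged sketch of the free-list reuse this supports (hypothetical
+   PyFooObject free list; not code from this header):
+
+       PyFooObject *op = free_list;
+       if (op != NULL) {
+           free_list = op->next_free;         // hypothetical link field
+           _Py_NewReference((PyObject *)op);  // re-traces the reused block
+       }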
+ */
+extern int _PyTraceMalloc_TraceRef(PyObject *op, PyRefTracerEvent event, void*);
+
+// Fast inlined version of PyType_HasFeature()
+static inline int
+_PyType_HasFeature(PyTypeObject *type, unsigned long feature) {
+    return ((FT_ATOMIC_LOAD_ULONG_RELAXED(type->tp_flags) & feature) != 0);
+}
+
+extern void _PyType_InitCache(PyInterpreterState *interp);
+
+extern PyStatus _PyObject_InitState(PyInterpreterState *interp);
+extern void _PyObject_FiniState(PyInterpreterState *interp);
+extern bool _PyRefchain_IsTraced(PyInterpreterState *interp, PyObject *obj);
+
+/* Inline functions trading binary compatibility for speed:
+   _PyObject_Init() is the fast version of PyObject_Init(), and
+   _PyObject_InitVar() is the fast version of PyObject_InitVar().
+
+   These inline functions must not be called with op=NULL. */
+static inline void
+_PyObject_Init(PyObject *op, PyTypeObject *typeobj)
+{
+    assert(op != NULL);
+    Py_SET_TYPE(op, typeobj);
+    assert(_PyType_HasFeature(typeobj, Py_TPFLAGS_HEAPTYPE) || _Py_IsImmortalLoose(typeobj));
+    Py_INCREF(typeobj);
+    _Py_NewReference(op);
+}
+
+static inline void
+_PyObject_InitVar(PyVarObject *op, PyTypeObject *typeobj, Py_ssize_t size)
+{
+    assert(op != NULL);
+    assert(typeobj != &PyLong_Type);
+    _PyObject_Init((PyObject *)op, typeobj);
+    Py_SET_SIZE(op, size);
+}
+
+
+/* Tell the GC to track this object.
+ *
+ * The object must not be tracked by the GC.
+ *
+ * NB: While the object is tracked by the collector, it must be safe to call the
+ * ob_traverse method.
+ *
+ * Internal note: interp->gc.generation0->_gc_prev doesn't have any bit flags
+ * because it's not an object header. So we don't use _PyGCHead_PREV() and
+ * _PyGCHead_SET_PREV() for it to avoid unnecessary bitwise operations.
+ *
+ * See also the public PyObject_GC_Track() function.
+ */
+static inline void _PyObject_GC_TRACK(
+// The preprocessor removes _PyObject_ASSERT_FROM() calls if NDEBUG is defined
+#ifndef NDEBUG
+    const char *filename, int lineno,
+#endif
+    PyObject *op)
+{
+    _PyObject_ASSERT_FROM(op, !_PyObject_GC_IS_TRACKED(op),
+                          "object already tracked by the garbage collector",
+                          filename, lineno, __func__);
+#ifdef Py_GIL_DISABLED
+    _PyObject_SET_GC_BITS(op, _PyGC_BITS_TRACKED);
+#else
+    PyGC_Head *gc = _Py_AS_GC(op);
+    _PyObject_ASSERT_FROM(op,
+                          (gc->_gc_prev & _PyGC_PREV_MASK_COLLECTING) == 0,
+                          "object is in generation which is garbage collected",
+                          filename, lineno, __func__);
+
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    PyGC_Head *generation0 = interp->gc.generation0;
+    PyGC_Head *last = (PyGC_Head*)(generation0->_gc_prev);
+    _PyGCHead_SET_NEXT(last, gc);
+    _PyGCHead_SET_PREV(gc, last);
+    _PyGCHead_SET_NEXT(gc, generation0);
+    generation0->_gc_prev = (uintptr_t)gc;
+#endif
+}
+
+/* Tell the GC to stop tracking this object.
+ *
+ * Internal note: This may be called while the GC is running, so
+ * _PyGC_PREV_MASK_COLLECTING must be cleared. But the
+ * _PyGC_PREV_MASK_FINALIZED bit is kept.
+ *
+ * The object must be tracked by the GC.
+ *
+ * See also the public PyObject_GC_UnTrack(), which accepts an object that is
+ * not tracked.
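+ *
+ * A hedged sketch of the common untrack-mutate-retrack pattern
+ * (illustrative call site, not code from this header):
+ *
+ *     _PyObject_GC_UNTRACK(op);
+ *     ...mutate op while a concurrent GC traversal could not handle it...
+ *     _PyObject_GC_TRACK(op);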
+ */
+static inline void _PyObject_GC_UNTRACK(
+// The preprocessor removes _PyObject_ASSERT_FROM() calls if NDEBUG is defined
+#ifndef NDEBUG
+    const char *filename, int lineno,
+#endif
+    PyObject *op)
+{
+    _PyObject_ASSERT_FROM(op, _PyObject_GC_IS_TRACKED(op),
+                          "object not tracked by the garbage collector",
+                          filename, lineno, __func__);
+
+#ifdef Py_GIL_DISABLED
+    _PyObject_CLEAR_GC_BITS(op, _PyGC_BITS_TRACKED);
+#else
+    PyGC_Head *gc = _Py_AS_GC(op);
+    PyGC_Head *prev = _PyGCHead_PREV(gc);
+    PyGC_Head *next = _PyGCHead_NEXT(gc);
+    _PyGCHead_SET_NEXT(prev, next);
+    _PyGCHead_SET_PREV(next, prev);
+    gc->_gc_next = 0;
+    gc->_gc_prev &= _PyGC_PREV_MASK_FINALIZED;
+#endif
+}
+
+// Macros to accept any type for the parameter, and to automatically pass
+// the filename and the line number (if NDEBUG is not defined) where the macro
+// is called.
+#ifdef NDEBUG
+#  define _PyObject_GC_TRACK(op) \
+        _PyObject_GC_TRACK(_PyObject_CAST(op))
+#  define _PyObject_GC_UNTRACK(op) \
+        _PyObject_GC_UNTRACK(_PyObject_CAST(op))
+#else
+#  define _PyObject_GC_TRACK(op) \
+        _PyObject_GC_TRACK(__FILE__, __LINE__, _PyObject_CAST(op))
+#  define _PyObject_GC_UNTRACK(op) \
+        _PyObject_GC_UNTRACK(__FILE__, __LINE__, _PyObject_CAST(op))
+#endif
+
+#ifdef Py_GIL_DISABLED
+
+/* Tries to increment an object's reference count
+ *
+ * This is a specialized version of _Py_TryIncref that only succeeds if the
+ * object is immortal or local to this thread. It does not handle the case
+ * where the reference count modification requires an atomic operation. This
+ * allows call sites to specialize for the immortal/local case.
+ */
+static inline int
+_Py_TryIncrefFast(PyObject *op) {
+    uint32_t local = _Py_atomic_load_uint32_relaxed(&op->ob_ref_local);
+    local += 1;
+    if (local == 0) {
+        // immortal
+        return 1;
+    }
+    if (_Py_IsOwnedByCurrentThread(op)) {
+        _Py_INCREF_STAT_INC();
+        _Py_atomic_store_uint32_relaxed(&op->ob_ref_local, local);
+#ifdef Py_REF_DEBUG
+        _Py_IncRefTotal(_PyThreadState_GET());
+#endif
+        return 1;
+    }
+    return 0;
+}
+
+static inline int
+_Py_TryIncRefShared(PyObject *op)
+{
+    Py_ssize_t shared = _Py_atomic_load_ssize_relaxed(&op->ob_ref_shared);
+    for (;;) {
+        // If the shared refcount is zero and the object is either merged
+        // or may not have weak references, then we cannot incref it.
+        if (shared == 0 || shared == _Py_REF_MERGED) {
+            return 0;
+        }
+
+        if (_Py_atomic_compare_exchange_ssize(
+                &op->ob_ref_shared,
+                &shared,
+                shared + (1 << _Py_REF_SHARED_SHIFT))) {
+#ifdef Py_REF_DEBUG
+            _Py_IncRefTotal(_PyThreadState_GET());
+#endif
+            _Py_INCREF_STAT_INC();
+            return 1;
+        }
+    }
+}
+
+/* Tries to incref the object op and ensures that *src still points to it. */
+static inline int
+_Py_TryIncrefCompare(PyObject **src, PyObject *op)
+{
+    if (_Py_TryIncrefFast(op)) {
+        return 1;
+    }
+    if (!_Py_TryIncRefShared(op)) {
+        return 0;
+    }
+    if (op != _Py_atomic_load_ptr(src)) {
+        Py_DECREF(op);
+        return 0;
+    }
+    return 1;
+}
+
+/* Loads and increfs an object from ptr, which may contain a NULL value.
+   Safe with concurrent (atomic) updates to ptr.
+   NOTE: The writer must set maybe-weakref on the stored object! */
+static inline PyObject *
+_Py_XGetRef(PyObject **ptr)
+{
+    for (;;) {
+        PyObject *value = _Py_atomic_load_ptr(ptr);
+        if (value == NULL) {
+            return value;
+        }
+        if (_Py_TryIncrefCompare(ptr, value)) {
+            return value;
+        }
+    }
+}
+
+/* Attempts to load and incref an object from ptr. Returns NULL
+   on failure, which may be due to a NULL value or a concurrent update.
*/ +static inline PyObject * +_Py_TryXGetRef(PyObject **ptr) +{ + PyObject *value = _Py_atomic_load_ptr(ptr); + if (value == NULL) { + return value; + } + if (_Py_TryIncrefCompare(ptr, value)) { + return value; + } + return NULL; +} + +/* Like Py_NewRef but also optimistically sets _Py_REF_MAYBE_WEAKREF + on objects owned by a different thread. */ +static inline PyObject * +_Py_NewRefWithLock(PyObject *op) +{ + if (_Py_TryIncrefFast(op)) { + return op; + } +#ifdef Py_REF_DEBUG + _Py_IncRefTotal(_PyThreadState_GET()); +#endif + _Py_INCREF_STAT_INC(); + for (;;) { + Py_ssize_t shared = _Py_atomic_load_ssize_relaxed(&op->ob_ref_shared); + Py_ssize_t new_shared = shared + (1 << _Py_REF_SHARED_SHIFT); + if ((shared & _Py_REF_SHARED_FLAG_MASK) == 0) { + new_shared |= _Py_REF_MAYBE_WEAKREF; + } + if (_Py_atomic_compare_exchange_ssize( + &op->ob_ref_shared, + &shared, + new_shared)) { + return op; + } + } +} + +static inline PyObject * +_Py_XNewRefWithLock(PyObject *obj) +{ + if (obj == NULL) { + return NULL; + } + return _Py_NewRefWithLock(obj); +} + +static inline void +_PyObject_SetMaybeWeakref(PyObject *op) +{ + if (_Py_IsImmortal(op)) { + return; + } + for (;;) { + Py_ssize_t shared = _Py_atomic_load_ssize_relaxed(&op->ob_ref_shared); + if ((shared & _Py_REF_SHARED_FLAG_MASK) != 0) { + // Nothing to do if it's in WEAKREFS, QUEUED, or MERGED states. + return; + } + if (_Py_atomic_compare_exchange_ssize( + &op->ob_ref_shared, &shared, shared | _Py_REF_MAYBE_WEAKREF)) { + return; + } + } +} + +extern int _PyObject_ResurrectEndSlow(PyObject *op); +#endif + +// Temporarily resurrects an object during deallocation. The refcount is set +// to one. +static inline void +_PyObject_ResurrectStart(PyObject *op) +{ + assert(Py_REFCNT(op) == 0); +#ifdef Py_REF_DEBUG + _Py_IncRefTotal(_PyThreadState_GET()); +#endif +#ifdef Py_GIL_DISABLED + _Py_atomic_store_uintptr_relaxed(&op->ob_tid, _Py_ThreadId()); + _Py_atomic_store_uint32_relaxed(&op->ob_ref_local, 1); + _Py_atomic_store_ssize_relaxed(&op->ob_ref_shared, 0); +#else + Py_SET_REFCNT(op, 1); +#endif +} + +// Undoes an object resurrection by decrementing the refcount without calling +// _Py_Dealloc(). Returns 0 if the object is dead (the normal case), and +// deallocation should continue. Returns 1 if the object is still alive. +static inline int +_PyObject_ResurrectEnd(PyObject *op) +{ +#ifdef Py_REF_DEBUG + _Py_DecRefTotal(_PyThreadState_GET()); +#endif +#ifndef Py_GIL_DISABLED + Py_SET_REFCNT(op, Py_REFCNT(op) - 1); + return Py_REFCNT(op) != 0; +#else + uint32_t local = _Py_atomic_load_uint32_relaxed(&op->ob_ref_local); + Py_ssize_t shared = _Py_atomic_load_ssize_acquire(&op->ob_ref_shared); + if (_Py_IsOwnedByCurrentThread(op) && local == 1 && shared == 0) { + // Fast-path: object has a single refcount and is owned by this thread + _Py_atomic_store_uint32_relaxed(&op->ob_ref_local, 0); + return 0; + } + // Slow-path: object has a shared refcount or is not owned by this thread + return _PyObject_ResurrectEndSlow(op); +#endif +} + +/* Tries to incref op and returns 1 if successful or 0 otherwise. 
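+
+   A hedged sketch (illustrative call site): promoting a borrowed pointer
+   read from a hypothetical `slots` array into a strong reference:
+
+       PyObject *item = slots[i];                // borrowed
+       if (item != NULL && _Py_TryIncref(item)) {
+           return item;                          // now a strong reference
+       }
+       return NULL;                              // lost a race with dealloc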
*/ +static inline int +_Py_TryIncref(PyObject *op) +{ +#ifdef Py_GIL_DISABLED + return _Py_TryIncrefFast(op) || _Py_TryIncRefShared(op); +#else + if (Py_REFCNT(op) > 0) { + Py_INCREF(op); + return 1; + } + return 0; +#endif +} + +#ifdef Py_REF_DEBUG +extern void _PyInterpreterState_FinalizeRefTotal(PyInterpreterState *); +extern void _Py_FinalizeRefTotal(_PyRuntimeState *); +extern void _PyDebug_PrintTotalRefs(void); +#endif + +#ifdef Py_TRACE_REFS +extern void _Py_AddToAllObjects(PyObject *op); +extern void _Py_PrintReferences(PyInterpreterState *, FILE *); +extern void _Py_PrintReferenceAddresses(PyInterpreterState *, FILE *); +#endif + + +/* Return the *address* of the object's weaklist. The address may be + * dereferenced to get the current head of the weaklist. This is useful + * for iterating over the linked list of weakrefs, especially when the + * list is being modified externally (e.g. refs getting removed). + * + * The returned pointer should not be used to change the head of the list + * nor should it be used to add, remove, or swap any refs in the list. + * That is the sole responsibility of the code in weakrefobject.c. + */ +static inline PyObject ** +_PyObject_GET_WEAKREFS_LISTPTR(PyObject *op) +{ + if (PyType_Check(op) && + ((PyTypeObject *)op)->tp_flags & _Py_TPFLAGS_STATIC_BUILTIN) { + PyInterpreterState *interp = _PyInterpreterState_GET(); + managed_static_type_state *state = _PyStaticType_GetState( + interp, (PyTypeObject *)op); + return _PyStaticType_GET_WEAKREFS_LISTPTR(state); + } + // Essentially _PyObject_GET_WEAKREFS_LISTPTR_FROM_OFFSET(): + Py_ssize_t offset = Py_TYPE(op)->tp_weaklistoffset; + return (PyObject **)((char *)op + offset); +} + +/* This is a special case of _PyObject_GET_WEAKREFS_LISTPTR(). + * Only the most fundamental lookup path is used. + * Consequently, static types should not be used. + * + * For static builtin types the returned pointer will always point + * to a NULL tp_weaklist. This is fine for any deallocation cases, + * since static types are never deallocated and static builtin types + * are only finalized at the end of runtime finalization. + * + * If the weaklist for static types is actually needed then use + * _PyObject_GET_WEAKREFS_LISTPTR(). 
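+ *
+ * A hedged sketch (illustrative only) of the deallocation-time check this
+ * enables:
+ *
+ *     if (_PyType_SUPPORTS_WEAKREFS(Py_TYPE(op)) &&
+ *         *_PyObject_GET_WEAKREFS_LISTPTR_FROM_OFFSET(op) != NULL)
+ *     {
+ *         PyObject_ClearWeakRefs(op);   // public API; clears the whole list
+ *     }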
+ */ +static inline PyWeakReference ** +_PyObject_GET_WEAKREFS_LISTPTR_FROM_OFFSET(PyObject *op) +{ + assert(!PyType_Check(op) || + ((PyTypeObject *)op)->tp_flags & Py_TPFLAGS_HEAPTYPE); + Py_ssize_t offset = Py_TYPE(op)->tp_weaklistoffset; + return (PyWeakReference **)((char *)op + offset); +} + +// Fast inlined version of PyObject_IS_GC() +static inline int +_PyObject_IS_GC(PyObject *obj) +{ + PyTypeObject *type = Py_TYPE(obj); + return (PyType_IS_GC(type) + && (type->tp_is_gc == NULL || type->tp_is_gc(obj))); +} + +// Fast inlined version of PyObject_Hash() +static inline Py_hash_t +_PyObject_HashFast(PyObject *op) +{ + if (PyUnicode_CheckExact(op)) { + Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED( + _PyASCIIObject_CAST(op)->hash); + if (hash != -1) { + return hash; + } + } + return PyObject_Hash(op); +} + +// Fast inlined version of PyType_IS_GC() +#define _PyType_IS_GC(t) _PyType_HasFeature((t), Py_TPFLAGS_HAVE_GC) + +static inline size_t +_PyType_PreHeaderSize(PyTypeObject *tp) +{ + return ( +#ifndef Py_GIL_DISABLED + _PyType_IS_GC(tp) * sizeof(PyGC_Head) + +#endif + _PyType_HasFeature(tp, Py_TPFLAGS_PREHEADER) * 2 * sizeof(PyObject *) + ); +} + +void _PyObject_GC_Link(PyObject *op); + +// Usage: assert(_Py_CheckSlotResult(obj, "__getitem__", result != NULL)); +extern int _Py_CheckSlotResult( + PyObject *obj, + const char *slot_name, + int success); + +// Test if a type supports weak references +static inline int _PyType_SUPPORTS_WEAKREFS(PyTypeObject *type) { + return (type->tp_weaklistoffset != 0); +} + +extern PyObject* _PyType_AllocNoTrack(PyTypeObject *type, Py_ssize_t nitems); +extern PyObject *_PyType_NewManagedObject(PyTypeObject *type); + +extern PyTypeObject* _PyType_CalculateMetaclass(PyTypeObject *, PyObject *); +extern PyObject* _PyType_GetDocFromInternalDoc(const char *, const char *); +extern PyObject* _PyType_GetTextSignatureFromInternalDoc(const char *, const char *, int); +extern int _PyObject_SetAttributeErrorContext(PyObject *v, PyObject* name); + +void _PyObject_InitInlineValues(PyObject *obj, PyTypeObject *tp); +extern int _PyObject_StoreInstanceAttribute(PyObject *obj, + PyObject *name, PyObject *value); +extern bool _PyObject_TryGetInstanceAttribute(PyObject *obj, PyObject *name, + PyObject **attr); + +#ifdef Py_GIL_DISABLED +# define MANAGED_DICT_OFFSET (((Py_ssize_t)sizeof(PyObject *))*-1) +# define MANAGED_WEAKREF_OFFSET (((Py_ssize_t)sizeof(PyObject *))*-2) +#else +# define MANAGED_DICT_OFFSET (((Py_ssize_t)sizeof(PyObject *))*-3) +# define MANAGED_WEAKREF_OFFSET (((Py_ssize_t)sizeof(PyObject *))*-4) +#endif + +typedef union { + PyDictObject *dict; +} PyManagedDictPointer; + +static inline PyManagedDictPointer * +_PyObject_ManagedDictPointer(PyObject *obj) +{ + assert(Py_TYPE(obj)->tp_flags & Py_TPFLAGS_MANAGED_DICT); + return (PyManagedDictPointer *)((char *)obj + MANAGED_DICT_OFFSET); +} + +static inline PyDictObject * +_PyObject_GetManagedDict(PyObject *obj) +{ + PyManagedDictPointer *dorv = _PyObject_ManagedDictPointer(obj); + return (PyDictObject *)FT_ATOMIC_LOAD_PTR_ACQUIRE(dorv->dict); +} + +static inline PyDictValues * +_PyObject_InlineValues(PyObject *obj) +{ + assert(Py_TYPE(obj)->tp_flags & Py_TPFLAGS_INLINE_VALUES); + assert(Py_TYPE(obj)->tp_flags & Py_TPFLAGS_MANAGED_DICT); + assert(Py_TYPE(obj)->tp_basicsize == sizeof(PyObject)); + return (PyDictValues *)((char *)obj + sizeof(PyObject)); +} + +extern PyObject ** _PyObject_ComputedDictPointer(PyObject *); +extern int _PyObject_IsInstanceDictEmpty(PyObject *); + +// Export for 'math' shared 
extension
+PyAPI_FUNC(PyObject*) _PyObject_LookupSpecial(PyObject *, PyObject *);
+
+extern int _PyObject_IsAbstract(PyObject *);
+
+PyAPI_FUNC(int) _PyObject_GetMethod(PyObject *obj, PyObject *name, PyObject **method);
+extern PyObject* _PyObject_NextNotImplemented(PyObject *);
+
+// Pickle support.
+// Export for '_datetime' shared extension
+PyAPI_FUNC(PyObject*) _PyObject_GetState(PyObject *);
+
+/* C function call trampolines to mitigate bad function pointer casts.
+ *
+ * Typical native ABIs ignore additional arguments or fill in missing
+ * values with 0/NULL in a function pointer cast. Compilers do not show
+ * warnings when a function pointer is explicitly cast to an
+ * incompatible type.
+ *
+ * Bad fpcasts are an issue in WebAssembly. WASM's indirect_call has strict
+ * function signature checks. Argument count, types, and return type must
+ * match.
+ *
+ * Third-party code unintentionally relies on problematic fpcasts. The call
+ * trampoline mitigates common occurrences of bad fpcasts on Emscripten.
+ */
+#if !(defined(__EMSCRIPTEN__) && defined(PY_CALL_TRAMPOLINE))
+#define _PyCFunction_TrampolineCall(meth, self, args) \
+    (meth)((self), (args))
+#define _PyCFunctionWithKeywords_TrampolineCall(meth, self, args, kw) \
+    (meth)((self), (args), (kw))
+#endif // __EMSCRIPTEN__ && PY_CALL_TRAMPOLINE
+
+// Export these 2 symbols for '_pickle' shared extension
+PyAPI_DATA(PyTypeObject) _PyNone_Type;
+PyAPI_DATA(PyTypeObject) _PyNotImplemented_Type;
+
+// Maps Py_LT to Py_GT, ..., Py_GE to Py_LE.
+// Export for the stable ABI.
+PyAPI_DATA(int) _Py_SwappedOp[];
+
+extern void _Py_GetConstant_Init(void);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_OBJECT_H */
diff --git a/Include/internal/pycore_object_alloc.h b/Include/internal/pycore_object_alloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..8cc7a444bc93e7141702a944c6fbce4fa8bc0961
--- /dev/null
+++ b/Include/internal/pycore_object_alloc.h
@@ -0,0 +1,71 @@
+#ifndef Py_INTERNAL_OBJECT_ALLOC_H
+#define Py_INTERNAL_OBJECT_ALLOC_H
+
+#include "pycore_object.h"    // _PyType_HasFeature()
+#include "pycore_pystate.h"   // _PyThreadState_GET()
+#include "pycore_tstate.h"    // _PyThreadStateImpl
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#ifdef Py_GIL_DISABLED
+static inline mi_heap_t *
+_PyObject_GetAllocationHeap(_PyThreadStateImpl *tstate, PyTypeObject *tp)
+{
+    struct _mimalloc_thread_state *m = &tstate->mimalloc;
+    if (_PyType_HasFeature(tp, Py_TPFLAGS_PREHEADER)) {
+        return &m->heaps[_Py_MIMALLOC_HEAP_GC_PRE];
+    }
+    else if (_PyType_IS_GC(tp)) {
+        return &m->heaps[_Py_MIMALLOC_HEAP_GC];
+    }
+    else {
+        return &m->heaps[_Py_MIMALLOC_HEAP_OBJECT];
+    }
+}
+#endif
+
+// Sets the heap used for PyObject_Malloc(), PyObject_Realloc(), etc. calls in
+// Py_GIL_DISABLED builds. We use different heaps depending on whether the
+// object supports GC and whether it has a pre-header. We smuggle the choice
+// of heap through the _mimalloc_thread_state. In the default build, this
+// simply calls PyObject_Malloc().
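+//
+// [Editor's note] Compiled-out sketch, not part of CPython: how a
+// hypothetical allocation site could use the helper below so that, in
+// Py_GIL_DISABLED builds, the memory comes from the mimalloc heap matching
+// the type. _PyObject_SIZE(tp) is tp->tp_basicsize.
+#if 0
+static PyObject *
+example_alloc_instance(PyTypeObject *tp)
+{
+    // Ignores object initialization, GC tracking, and tp_itemsize for brevity.
+    PyObject *op = (PyObject *)_PyObject_MallocWithType(tp, _PyObject_SIZE(tp));
+    if (op == NULL) {
+        return PyErr_NoMemory();
+    }
+    return op;
+}
+#endif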
+static inline void * +_PyObject_MallocWithType(PyTypeObject *tp, size_t size) +{ +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + struct _mimalloc_thread_state *m = &tstate->mimalloc; + m->current_object_heap = _PyObject_GetAllocationHeap(tstate, tp); +#endif + void *mem = PyObject_Malloc(size); +#ifdef Py_GIL_DISABLED + m->current_object_heap = &m->heaps[_Py_MIMALLOC_HEAP_OBJECT]; +#endif + return mem; +} + +static inline void * +_PyObject_ReallocWithType(PyTypeObject *tp, void *ptr, size_t size) +{ +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + struct _mimalloc_thread_state *m = &tstate->mimalloc; + m->current_object_heap = _PyObject_GetAllocationHeap(tstate, tp); +#endif + void *mem = PyObject_Realloc(ptr, size); +#ifdef Py_GIL_DISABLED + m->current_object_heap = &m->heaps[_Py_MIMALLOC_HEAP_OBJECT]; +#endif + return mem; +} + +#ifdef __cplusplus +} +#endif +#endif // !Py_INTERNAL_OBJECT_ALLOC_H diff --git a/Include/internal/pycore_object_stack.h b/Include/internal/pycore_object_stack.h new file mode 100644 index 0000000000000000000000000000000000000000..639f3c0c0d0b766290f1f4dfed44115fc2aacfcc --- /dev/null +++ b/Include/internal/pycore_object_stack.h @@ -0,0 +1,97 @@ +#ifndef Py_INTERNAL_OBJECT_STACK_H +#define Py_INTERNAL_OBJECT_STACK_H + +#include "pycore_freelist.h" // _PyFreeListState + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +// _PyObjectStack is a stack of Python objects implemented as a linked list of +// fixed size buffers. + +// Chosen so that _PyObjectStackChunk is a power-of-two size. +#define _Py_OBJECT_STACK_CHUNK_SIZE 254 + +typedef struct _PyObjectStackChunk { + struct _PyObjectStackChunk *prev; + Py_ssize_t n; + PyObject *objs[_Py_OBJECT_STACK_CHUNK_SIZE]; +} _PyObjectStackChunk; + +typedef struct _PyObjectStack { + _PyObjectStackChunk *head; +} _PyObjectStack; + + +extern _PyObjectStackChunk * +_PyObjectStackChunk_New(void); + +extern void +_PyObjectStackChunk_Free(_PyObjectStackChunk *); + +// Push an item onto the stack. Return -1 on allocation failure, 0 on success. +static inline int +_PyObjectStack_Push(_PyObjectStack *stack, PyObject *obj) +{ + _PyObjectStackChunk *buf = stack->head; + if (buf == NULL || buf->n == _Py_OBJECT_STACK_CHUNK_SIZE) { + buf = _PyObjectStackChunk_New(); + if (buf == NULL) { + return -1; + } + buf->prev = stack->head; + buf->n = 0; + stack->head = buf; + } + + assert(buf->n >= 0 && buf->n < _Py_OBJECT_STACK_CHUNK_SIZE); + buf->objs[buf->n] = obj; + buf->n++; + return 0; +} + +// Pop the top item from the stack. Return NULL if the stack is empty. 
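+// [Editor's note] Compiled-out sketch, not from CPython, of typical use of
+// the push/pop pair defined here: push may fail on chunk allocation, and pop
+// (defined just below) drains the stack until it is empty.
+#if 0
+static int
+example_roundtrip(_PyObjectStack *stack, PyObject *obj)
+{
+    if (_PyObjectStack_Push(stack, obj) < 0) {
+        return -1;  // allocating a new chunk failed
+    }
+    PyObject *op;
+    while ((op = _PyObjectStack_Pop(stack)) != NULL) {
+        // ... process op ...
+    }
+    return 0;
+}
+#endif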
+static inline PyObject *
+_PyObjectStack_Pop(_PyObjectStack *stack)
+{
+    _PyObjectStackChunk *buf = stack->head;
+    if (buf == NULL) {
+        return NULL;
+    }
+    assert(buf->n > 0 && buf->n <= _Py_OBJECT_STACK_CHUNK_SIZE);
+    buf->n--;
+    PyObject *obj = buf->objs[buf->n];
+    if (buf->n == 0) {
+        stack->head = buf->prev;
+        _PyObjectStackChunk_Free(buf);
+    }
+    return obj;
+}
+
+static inline Py_ssize_t
+_PyObjectStack_Size(_PyObjectStack *stack)
+{
+    Py_ssize_t size = 0;
+    for (_PyObjectStackChunk *buf = stack->head; buf != NULL; buf = buf->prev) {
+        size += buf->n;
+    }
+    return size;
+}
+
+// Merge src into dst, leaving src empty
+extern void
+_PyObjectStack_Merge(_PyObjectStack *dst, _PyObjectStack *src);
+
+// Remove all items from the stack
+extern void
+_PyObjectStack_Clear(_PyObjectStack *stack);
+
+#ifdef __cplusplus
+}
+#endif
+#endif // !Py_INTERNAL_OBJECT_STACK_H
diff --git a/Include/internal/pycore_object_state.h b/Include/internal/pycore_object_state.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd7c9335b3e611c23b786d551d3fb9ab977c015f
--- /dev/null
+++ b/Include/internal/pycore_object_state.h
@@ -0,0 +1,41 @@
+#ifndef Py_INTERNAL_OBJECT_STATE_H
+#define Py_INTERNAL_OBJECT_STATE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_freelist.h"   // _PyObject_freelists
+#include "pycore_hashtable.h"  // _Py_hashtable_t
+
+struct _py_object_runtime_state {
+#ifdef Py_REF_DEBUG
+    Py_ssize_t interpreter_leaks;
+#endif
+    int _not_used;
+};
+
+struct _py_object_state {
+#if !defined(Py_GIL_DISABLED)
+    struct _Py_object_freelists freelists;
+#endif
+#ifdef Py_REF_DEBUG
+    Py_ssize_t reftotal;
+#endif
+#ifdef Py_TRACE_REFS
+    // Hash table storing all objects. The key is the object pointer
+    // (PyObject*) and the value is always the number 1 (as uintptr_t).
+    // See _PyRefchain_IsTraced() and _PyRefchain_Trace() functions.
+    _Py_hashtable_t *refchain;
+#endif
+    int _not_used;
+};
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_OBJECT_STATE_H */
diff --git a/Include/internal/pycore_obmalloc.h b/Include/internal/pycore_obmalloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..9140d8f08f0af1e677a5234187050737ccb71955
--- /dev/null
+++ b/Include/internal/pycore_obmalloc.h
@@ -0,0 +1,702 @@
+#ifndef Py_INTERNAL_OBMALLOC_H
+#define Py_INTERNAL_OBMALLOC_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+typedef unsigned int pymem_uint;  /* assuming >= 16 bits */
+
+#undef  uint
+#define uint pymem_uint
+
+
+/* An object allocator for Python.
+
+   Here is an introduction to the layers of the Python memory architecture,
+   showing where the object allocator is actually used (layer +2). It is
+   called for every object allocation and deallocation (PyObject_New/Del),
+   unless the object-specific allocators implement a proprietary allocation
+   scheme (ex.: ints use a simple free list). This is also the place where
+   the cyclic garbage collector operates selectively on container objects.
+
+
+        Object-specific allocators
+    _____   ______   ______       ________
+   [ int ] [ dict ] [ list ] ... [ string ]       Python core         |
++3 | <----- Object-specific memory -----> | <-- Non-object memory --> |
+    _______________________________       |                           |
+   [   Python's object allocator   ]      |                           |
++2 | ####### Object memory ####### | <------ Internal buffers ------> |
+    ______________________________________________________________    |
+   [          Python's raw memory allocator (PyMem_ API)          ]   |
++1 | <----- Python memory (under PyMem manager's control) ------> |   |
+    __________________________________________________________________
+   [    Underlying general-purpose allocator (ex: C library malloc)   ]
+ 0 | <------ Virtual memory allocated for the python process -------> |
+
+   =========================================================================
+    _______________________________________________________________________
+   [                OS-specific Virtual Memory Manager (VMM)               ]
+-1 | <--- Kernel dynamic storage allocation & management (page-based) ---> |
+    __________________________________   __________________________________
+   [                                  ] [                                  ]
+-2 | <-- Physical memory: ROM/RAM --> | | <-- Secondary storage (swap) --> |
+
+*/
+/*==========================================================================*/
+
+/* A fast, special-purpose memory allocator for small blocks, to be used
+   on top of a general-purpose malloc -- heavily based on previous art. */
+
+/* Vladimir Marangozov -- August 2000 */
+
+/*
+ * "Memory management is where the rubber meets the road -- if we do the wrong
+ * thing at any level, the results will not be good. And if we don't make the
+ * levels work well together, we are in serious trouble." (1)
+ *
+ * (1) Paul R. Wilson, Mark S. Johnstone, Michael Neely, and David Boles,
+ * "Dynamic Storage Allocation: A Survey and Critical Review",
+ * in Proc. 1995 Int'l. Workshop on Memory Management, September 1995.
+ */
+
+/* #undef WITH_MEMORY_LIMITS */         /* disable mem limit checks  */
+
+/*==========================================================================*/
+
+/*
+ * Allocation strategy abstract:
+ *
+ * For small requests, the allocator sub-allocates blocks of memory.
+ * Requests greater than SMALL_REQUEST_THRESHOLD bytes are routed to the
+ * system's allocator.
+ *
+ * Small requests are grouped in size classes spaced 8 bytes apart, due
+ * to the required valid alignment of the returned address. Requests of
+ * a particular size are serviced from memory pools of 4K (one VMM page).
+ * Pools are fragmented on demand and contain free lists of blocks of one
+ * particular size class. In other words, there is a fixed-size allocator
+ * for each size class. Free pools are shared by the different allocators,
+ * thus minimizing the space reserved for a particular size class.
+ *
+ * This allocation strategy is a variant of what is known as "simple
+ * segregated storage based on array of free lists". The main drawback of
+ * simple segregated storage is that we might end up with a lot of reserved
+ * memory for the different free lists, which degenerate over time. To avoid
+ * this, we partition each free list in pools and we share dynamically the
+ * reserved space between all free lists. This technique is quite efficient
+ * for memory-intensive programs which allocate mainly small-sized blocks.
+ *
+ * For small requests we have the following table:
+ *
+ * Request in bytes     Size of allocated block      Size class idx
+ * ----------------------------------------------------------------
+ *        1-8                     8                       0
+ *        9-16                   16                       1
+ *       17-24                   24                       2
+ *       25-32                   32                       3
+ *       33-40                   40                       4
+ *       41-48                   48                       5
+ *       49-56                   56                       6
+ *       57-64                   64                       7
+ *       65-72                   72                       8
+ *        ...                   ...                     ...
+ *      497-504                 504                      62
+ *      505-512                 512                      63
+ *
+ *      0, SMALL_REQUEST_THRESHOLD + 1 and up: routed to the underlying
+ *      allocator.
+ *
+ *      (Worked example: with 8-byte classes, ALIGNMENT_SHIFT == 3, a request
+ *      of 20 bytes maps to size class idx = (20 - 1) >> 3 == 2, i.e. a
+ *      24-byte block, since INDEX2SIZE(2) == (2 + 1) << 3 == 24.)
+ */
+
+/*==========================================================================*/
+
+/*
+ * -- Main tunable settings section --
+ */
+
+/*
+ * Alignment of addresses returned to the user. 8-bytes alignment works
+ * on most current architectures (with 32-bit or 64-bit address buses).
+ * The alignment value is also used for grouping small requests in size
+ * classes spaced ALIGNMENT bytes apart.
+ *
+ * You shouldn't change this unless you know what you are doing.
+ */
+
+#if SIZEOF_VOID_P > 4
+#define ALIGNMENT              16               /* must be 2^N */
+#define ALIGNMENT_SHIFT         4
+#else
+#define ALIGNMENT               8               /* must be 2^N */
+#define ALIGNMENT_SHIFT         3
+#endif
+
+/* Return the number of bytes in size class I, as a uint. */
+#define INDEX2SIZE(I) (((pymem_uint)(I) + 1) << ALIGNMENT_SHIFT)
+
+/*
+ * Max size threshold below which malloc requests are considered to be
+ * small enough in order to use preallocated memory pools. You can tune
+ * this value according to your application behaviour and memory needs.
+ *
+ * Note: a size threshold of 512 guarantees that newly created dictionaries
+ * will be allocated from preallocated memory pools on 64-bit.
+ *
+ * The following invariants must hold:
+ *   1) ALIGNMENT <= SMALL_REQUEST_THRESHOLD <= 512
+ *   2) SMALL_REQUEST_THRESHOLD is evenly divisible by ALIGNMENT
+ *
+ * Although not required, for better performance and space efficiency,
+ * it is recommended that SMALL_REQUEST_THRESHOLD is set to a power of 2.
+ */
+#define SMALL_REQUEST_THRESHOLD 512
+#define NB_SMALL_SIZE_CLASSES   (SMALL_REQUEST_THRESHOLD / ALIGNMENT)
+
+/*
+ * The system's VMM page size can be obtained on most unices with a
+ * getpagesize() call or deduced from various header files. To make
+ * things simpler, we assume that it is 4K, which is OK for most systems.
+ * It is probably better if this is the native page size, but it doesn't
+ * have to be. In theory, if SYSTEM_PAGE_SIZE is larger than the native page
+ * size, then `POOL_ADDR(p)->arenaindex' could rarely cause a segmentation
+ * violation fault. 4K is apparently OK for all the platforms that python
+ * currently targets.
+ */
+#define SYSTEM_PAGE_SIZE        (4 * 1024)
+
+/*
+ * Maximum amount of memory managed by the allocator for small requests.
+ */
+#ifdef WITH_MEMORY_LIMITS
+#ifndef SMALL_MEMORY_LIMIT
+#define SMALL_MEMORY_LIMIT      (64 * 1024 * 1024)      /* 64 MB -- more? */
+#endif
+#endif
+
+#if !defined(WITH_PYMALLOC_RADIX_TREE)
+/* Use radix-tree to track arena memory regions, for address_in_range().
+ * Enable by default since it allows larger pool sizes. Can be disabled
+ * using -DWITH_PYMALLOC_RADIX_TREE=0 */
+#define WITH_PYMALLOC_RADIX_TREE 1
+#endif
+
+#if SIZEOF_VOID_P > 4
+/* on 64-bit platforms use larger pools and arenas if we can */
+#define USE_LARGE_ARENAS
+#if WITH_PYMALLOC_RADIX_TREE
+/* large pools only supported if radix-tree is enabled */
+#define USE_LARGE_POOLS
+#endif
+#endif
+
+/*
+ * The allocator sub-allocates blocks of memory (called arenas) aligned
+ * on a page boundary. This is a reserved virtual address space for the
+ * current process (obtained through a malloc()/mmap() call). In no way does
+ * this mean that the memory arenas will be used entirely. A malloc() is
+ * usually an address range reservation for bytes, unless all pages within
+ * this space are referenced subsequently. So malloc'ing big blocks and not
+ * using them does not mean "wasting memory".
It's an addressable range + * wastage... + * + * Arenas are allocated with mmap() on systems supporting anonymous memory + * mappings to reduce heap fragmentation. + */ +#ifdef USE_LARGE_ARENAS +#define ARENA_BITS 20 /* 1 MiB */ +#else +#define ARENA_BITS 18 /* 256 KiB */ +#endif +#define ARENA_SIZE (1 << ARENA_BITS) +#define ARENA_SIZE_MASK (ARENA_SIZE - 1) + +#ifdef WITH_MEMORY_LIMITS +#define MAX_ARENAS (SMALL_MEMORY_LIMIT / ARENA_SIZE) +#endif + +/* + * Size of the pools used for small blocks. Must be a power of 2. + */ +#ifdef USE_LARGE_POOLS +#define POOL_BITS 14 /* 16 KiB */ +#else +#define POOL_BITS 12 /* 4 KiB */ +#endif +#define POOL_SIZE (1 << POOL_BITS) +#define POOL_SIZE_MASK (POOL_SIZE - 1) + +#if !WITH_PYMALLOC_RADIX_TREE +#if POOL_SIZE != SYSTEM_PAGE_SIZE +# error "pool size must be equal to system page size" +#endif +#endif + +#define MAX_POOLS_IN_ARENA (ARENA_SIZE / POOL_SIZE) +#if MAX_POOLS_IN_ARENA * POOL_SIZE != ARENA_SIZE +# error "arena size not an exact multiple of pool size" +#endif + +/* + * -- End of tunable settings section -- + */ + +/*==========================================================================*/ + +/* When you say memory, my mind reasons in terms of (pointers to) blocks */ +typedef uint8_t pymem_block; + +/* Pool for small blocks. */ +struct pool_header { + union { pymem_block *_padding; + uint count; } ref; /* number of allocated blocks */ + pymem_block *freeblock; /* pool's free list head */ + struct pool_header *nextpool; /* next pool of this size class */ + struct pool_header *prevpool; /* previous pool "" */ + uint arenaindex; /* index into arenas of base adr */ + uint szidx; /* block size class index */ + uint nextoffset; /* bytes to virgin block */ + uint maxnextoffset; /* largest valid nextoffset */ +}; + +typedef struct pool_header *poolp; + +/* Record keeping for arenas. */ +struct arena_object { + /* The address of the arena, as returned by malloc. Note that 0 + * will never be returned by a successful malloc, and is used + * here to mark an arena_object that doesn't correspond to an + * allocated arena. + */ + uintptr_t address; + + /* Pool-aligned pointer to the next pool to be carved off. */ + pymem_block* pool_address; + + /* The number of available pools in the arena: free pools + never- + * allocated pools. + */ + uint nfreepools; + + /* The total number of pools in the arena, whether or not available. */ + uint ntotalpools; + + /* Singly-linked list of available pools. */ + struct pool_header* freepools; + + /* Whenever this arena_object is not associated with an allocated + * arena, the nextarena member is used to link all unassociated + * arena_objects in the singly-linked `unused_arena_objects` list. + * The prevarena member is unused in this case. + * + * When this arena_object is associated with an allocated arena + * with at least one available pool, both members are used in the + * doubly-linked `usable_arenas` list, which is maintained in + * increasing order of `nfreepools` values. + * + * Else this arena_object is associated with an allocated arena + * all of whose pools are in use. `nextarena` and `prevarena` + * are both meaningless in this case. 
+     */
+    struct arena_object* nextarena;
+    struct arena_object* prevarena;
+};
+
+#define POOL_OVERHEAD   _Py_SIZE_ROUND_UP(sizeof(struct pool_header), ALIGNMENT)
+
+#define DUMMY_SIZE_IDX      0xffff      /* size class of newly cached pools */
+
+/* Round pointer P down to the closest pool-aligned address <= P, as a poolp */
+#define POOL_ADDR(P) ((poolp)_Py_ALIGN_DOWN((P), POOL_SIZE))
+
+/* Return total number of blocks in pool of size index I, as a uint. */
+#define NUMBLOCKS(I) ((pymem_uint)(POOL_SIZE - POOL_OVERHEAD) / INDEX2SIZE(I))
+
+/*==========================================================================*/
+
+/*
+ * Pool table -- headed, circular, doubly-linked lists of partially used pools.
+
+This is involved. For an index i, usedpools[i+i] is the header for a list of
+all partially used pools holding small blocks with "size class idx" i. So
+usedpools[0] corresponds to blocks of size 8, usedpools[2] to blocks of size
+16, and so on: index 2*i <-> blocks of size (i+1)<<ALIGNMENT_SHIFT.
+
+Blocks within a pool are carved out as needed: pool->freeblock points to
+the start of a singly-linked list of free blocks within the pool. When a
+block is freed, it's inserted at the front of its pool's freeblock list. Note
+that the available blocks in a pool are *not* linked all together when a pool
+is initialized. Instead only "the first two" (lowest addresses) blocks are
+set up, returning the first such block, and setting pool->freeblock to a
+one-block list holding the second such block. This is consistent with the
+fact that pymalloc strives at all levels (arena, pool, and block) never to
+touch a piece of memory until it's actually needed.
+
+So long as a pool is in the used state, we're certain there *is* a block
+available for allocating, and pool->freeblock is not NULL. If pool->freeblock
+points to the end of the free list before we've carved the entire pool into
+blocks, that means we simply haven't yet gotten to one of the higher-address
+blocks. The offset from the pool_header to the start of "the next" virgin
+block is stored in the pool_header nextoffset member, and the largest value
+of nextoffset that makes sense is stored in the maxnextoffset member when a
+pool is initialized. All the blocks in a pool have been passed out at least
+once when and only when nextoffset > maxnextoffset.
+
+
+Major obscurity: While the usedpools vector is declared to have poolp
+entries, it doesn't really. It really contains two pointers per (conceptual)
+poolp entry, the nextpool and prevpool members of a pool_header. The
+excruciating initialization code below fools C so that
+
+    usedpools[i+i]
+
+"acts like" a genuine poolp, but only so long as you only reference its
+nextpool and prevpool members. The "- 2*sizeof(pymem_block *)" gibberish is
+compensating for the fact that a pool_header's nextpool and prevpool members
+immediately follow a pool_header's first two members:
+
+    union { pymem_block *_padding;
+            uint count; } ref;
+    pymem_block *freeblock;
+
+each of which consumes sizeof(pymem_block *) bytes. So what usedpools[i+i]
+really contains is a fudged-up pointer p such that *if* C believes it's a
+poolp pointer, then p->nextpool and p->prevpool are both p (meaning that the
+headed circular list is empty).
+
+It's unclear why the usedpools setup is so convoluted. It could be to
+minimize the amount of cache required to hold this heavily-referenced table
+(which only *needs* the two interpool pointer members of a pool_header).
+OTOH, referencing code has to remember to "double the index" and doing so
+isn't free, usedpools[0] isn't a strictly legal pointer, and we're crucially
+relying on the fact that C doesn't insert any padding anywhere in a
+pool_header at or before the prevpool member.
+**************************************************************************** */
+
+#define OBMALLOC_USED_POOLS_SIZE (2 * ((NB_SMALL_SIZE_CLASSES + 7) / 8) * 8)
+
+struct _obmalloc_pools {
+    poolp used[OBMALLOC_USED_POOLS_SIZE];
+};
+
+
+/*==========================================================================
+Arena management.
+
+`arenas` is a vector of arena_objects. It contains maxarenas entries, some of
+which may not be currently used (== they're arena_objects that aren't
+currently associated with an allocated arena). Note that arenas proper are
+separately malloc'ed.
+
+Prior to Python 2.5, arenas were never free()'ed. Starting with Python 2.5,
+we do try to free() arenas, and use some mild heuristic strategies to increase
+the likelihood that arenas eventually can be freed.
+
+unused_arena_objects
+
+    This is a singly-linked list of the arena_objects that are currently not
+    being used (no arena is associated with them). Objects are taken off the
+    head of the list in new_arena(), and are pushed on the head of the list in
+    PyObject_Free() when the arena is empty. Key invariant: an arena_object
+    is on this list if and only if its .address member is 0.
+
+usable_arenas
+
+    This is a doubly-linked list of the arena_objects associated with arenas
+    that have pools available. These pools are either waiting to be reused,
+    or have not been used before. The list is sorted to have the most-
+    allocated arenas first (ascending order based on the nfreepools member).
+    This means that the next allocation will come from a heavily used arena,
+    which gives the nearly empty arenas a chance to be returned to the system.
+    In my unscientific tests this dramatically improved the number of arenas
+    that could be freed.
+
+Note that an arena_object associated with an arena all of whose pools are
+currently in use isn't on either list.
+
+Changed in Python 3.8: keeping usable_arenas sorted by number of free pools
+used to be done by one-at-a-time linear search when an arena's number of
+free pools changed. That could, overall, consume time quadratic in the
+number of arenas. That didn't really matter when there were only a few
+hundred arenas (typical!), but could be a timing disaster when there were
+hundreds of thousands. See bpo-37029.
+
+Now we have a vector of "search fingers" to eliminate the need to search:
+nfp2lasta[nfp] returns the last ("rightmost") arena in usable_arenas
+with nfp free pools. This is NULL if and only if there is no arena with
+nfp free pools in usable_arenas.
+*/
+
+/* How many arena_objects do we initially allocate?
+ * 16 = can allocate 16 arenas = 16 * ARENA_SIZE (4 MiB with 256 KiB arenas,
+ * 16 MiB with 1 MiB arenas) before growing the `arenas` vector.
+ */
+#define INITIAL_ARENA_OBJECTS 16
+
+struct _obmalloc_mgmt {
+    /* Array of objects used to track chunks of memory (arenas). */
+    struct arena_object* arenas;
+    /* Number of slots currently allocated in the `arenas` vector. */
+    uint maxarenas;
+
+    /* The head of the singly-linked, NULL-terminated list of available
+     * arena_objects.
+     */
+    struct arena_object* unused_arena_objects;
+
+    /* The head of the doubly-linked list (NULL-terminated at each end) of
+     * arena_objects associated with arenas that have pools available.
+ */ + struct arena_object* usable_arenas; + + /* nfp2lasta[nfp] is the last arena in usable_arenas with nfp free pools */ + struct arena_object* nfp2lasta[MAX_POOLS_IN_ARENA + 1]; + + /* Number of arenas allocated that haven't been free()'d. */ + size_t narenas_currently_allocated; + + /* Total number of times malloc() called to allocate an arena. */ + size_t ntimes_arena_allocated; + /* High water mark (max value ever seen) for narenas_currently_allocated. */ + size_t narenas_highwater; + + Py_ssize_t raw_allocated_blocks; +}; + + +#if WITH_PYMALLOC_RADIX_TREE +/*==========================================================================*/ +/* radix tree for tracking arena usage. If enabled, used to implement + address_in_range(). + + memory address bit allocation for keys + + 64-bit pointers, IGNORE_BITS=0 and 2^20 arena size: + 15 -> MAP_TOP_BITS + 15 -> MAP_MID_BITS + 14 -> MAP_BOT_BITS + 20 -> ideal aligned arena + ---- + 64 + + 64-bit pointers, IGNORE_BITS=16, and 2^20 arena size: + 16 -> IGNORE_BITS + 10 -> MAP_TOP_BITS + 10 -> MAP_MID_BITS + 8 -> MAP_BOT_BITS + 20 -> ideal aligned arena + ---- + 64 + + 32-bit pointers and 2^18 arena size: + 14 -> MAP_BOT_BITS + 18 -> ideal aligned arena + ---- + 32 + +*/ + +#if SIZEOF_VOID_P == 8 + +/* number of bits in a pointer */ +#define POINTER_BITS 64 + +/* High bits of memory addresses that will be ignored when indexing into the + * radix tree. Setting this to zero is the safe default. For most 64-bit + * machines, setting this to 16 would be safe. The kernel would not give + * user-space virtual memory addresses that have significant information in + * those high bits. The main advantage to setting IGNORE_BITS > 0 is that less + * virtual memory will be used for the top and middle radix tree arrays. Those + * arrays are allocated in the BSS segment and so will typically consume real + * memory only if actually accessed. + */ +#define IGNORE_BITS 0 + +/* use the top and mid layers of the radix tree */ +#define USE_INTERIOR_NODES + +#elif SIZEOF_VOID_P == 4 + +#define POINTER_BITS 32 +#define IGNORE_BITS 0 + +#else + + /* Currently this code works for 64-bit or 32-bit pointers only. */ +#error "obmalloc radix tree requires 64-bit or 32-bit pointers." 
+
+#endif /* SIZEOF_VOID_P */
+
+/* arena_coverage_t members require this to be true */
+#if ARENA_BITS >= 32
+#   error "arena size must be < 2^32"
+#endif
+
+/* the lower bits of the address that are not ignored */
+#define ADDRESS_BITS (POINTER_BITS - IGNORE_BITS)
+
+#ifdef USE_INTERIOR_NODES
+/* number of bits used for MAP_TOP and MAP_MID nodes
+ * (e.g. 64-bit pointers, IGNORE_BITS == 0, ARENA_BITS == 20:
+ * (64 - 20 + 2) / 3 == 15 bits each for the top and mid levels,
+ * leaving 64 - 20 - 2*15 == 14 bits for MAP_BOT_BITS) */
+#define INTERIOR_BITS ((ADDRESS_BITS - ARENA_BITS + 2) / 3)
+#else
+#define INTERIOR_BITS 0
+#endif
+
+#define MAP_TOP_BITS INTERIOR_BITS
+#define MAP_TOP_LENGTH (1 << MAP_TOP_BITS)
+#define MAP_TOP_MASK (MAP_TOP_LENGTH - 1)
+
+#define MAP_MID_BITS INTERIOR_BITS
+#define MAP_MID_LENGTH (1 << MAP_MID_BITS)
+#define MAP_MID_MASK (MAP_MID_LENGTH - 1)
+
+#define MAP_BOT_BITS (ADDRESS_BITS - ARENA_BITS - 2*INTERIOR_BITS)
+#define MAP_BOT_LENGTH (1 << MAP_BOT_BITS)
+#define MAP_BOT_MASK (MAP_BOT_LENGTH - 1)
+
+#define MAP_BOT_SHIFT ARENA_BITS
+#define MAP_MID_SHIFT (MAP_BOT_BITS + MAP_BOT_SHIFT)
+#define MAP_TOP_SHIFT (MAP_MID_BITS + MAP_MID_SHIFT)
+
+#define AS_UINT(p) ((uintptr_t)(p))
+#define MAP_BOT_INDEX(p) ((AS_UINT(p) >> MAP_BOT_SHIFT) & MAP_BOT_MASK)
+#define MAP_MID_INDEX(p) ((AS_UINT(p) >> MAP_MID_SHIFT) & MAP_MID_MASK)
+#define MAP_TOP_INDEX(p) ((AS_UINT(p) >> MAP_TOP_SHIFT) & MAP_TOP_MASK)
+
+#if IGNORE_BITS > 0
+/* Return the ignored part of the pointer address. Those bits should be the
+ * same for all valid pointers if IGNORE_BITS is set correctly.
+ */
+#define HIGH_BITS(p) (AS_UINT(p) >> ADDRESS_BITS)
+#else
+#define HIGH_BITS(p) 0
+#endif
+
+
+/* This is the leaf of the radix tree. See arena_map_mark_used() for the
+ * meaning of these members. */
+typedef struct {
+    int32_t tail_hi;
+    int32_t tail_lo;
+} arena_coverage_t;
+
+typedef struct arena_map_bot {
+    /* The members tail_hi and tail_lo are accessed together, so it is
+     * better to have them as an array of structs, rather than two
+     * arrays.
+     */
+    arena_coverage_t arenas[MAP_BOT_LENGTH];
+} arena_map_bot_t;
+
+#ifdef USE_INTERIOR_NODES
+typedef struct arena_map_mid {
+    struct arena_map_bot *ptrs[MAP_MID_LENGTH];
+} arena_map_mid_t;
+
+typedef struct arena_map_top {
+    struct arena_map_mid *ptrs[MAP_TOP_LENGTH];
+} arena_map_top_t;
+#endif
+
+struct _obmalloc_usage {
+    /* The root of the radix tree. Note that by initializing like this, the
+     * memory should be in the BSS. The OS will only memory map pages as the
+     * MAP_MID nodes get used (OS pages are demand loaded as needed).
+     */
+#ifdef USE_INTERIOR_NODES
+    arena_map_top_t arena_map_root;
+    /* accounting for number of used interior nodes */
+    int arena_map_mid_count;
+    int arena_map_bot_count;
+#else
+    arena_map_bot_t arena_map_root;
+#endif
+};
+
+#endif /* WITH_PYMALLOC_RADIX_TREE */
+
+
+struct _obmalloc_global_state {
+    int dump_debug_stats;
+    Py_ssize_t interpreter_leaks;
+};
+
+struct _obmalloc_state {
+    struct _obmalloc_pools pools;
+    struct _obmalloc_mgmt mgmt;
+#if WITH_PYMALLOC_RADIX_TREE
+    struct _obmalloc_usage usage;
+#endif
+};
+
+
+#undef  uint
+
+
+/* Allocate memory directly from the O/S virtual memory system,
+ * where supported.
+ * Otherwise, fall back on malloc. */
+void *_PyObject_VirtualAlloc(size_t size);
+void _PyObject_VirtualFree(void *, size_t size);
+
+
+/* This function returns the number of allocated memory blocks, regardless of size */
+extern Py_ssize_t _Py_GetGlobalAllocatedBlocks(void);
+#define _Py_GetAllocatedBlocks() \
+    _Py_GetGlobalAllocatedBlocks()
+extern Py_ssize_t _PyInterpreterState_GetAllocatedBlocks(PyInterpreterState *);
+extern void _PyInterpreterState_FinalizeAllocatedBlocks(PyInterpreterState *);
+extern int _PyMem_init_obmalloc(PyInterpreterState *interp);
+extern bool _PyMem_obmalloc_state_on_heap(PyInterpreterState *interp);
+
+
+#ifdef WITH_PYMALLOC
+// Export the symbol for the 3rd party 'guppy3' project
+PyAPI_FUNC(int) _PyObject_DebugMallocStats(FILE *out);
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // !Py_INTERNAL_OBMALLOC_H
diff --git a/Include/internal/pycore_obmalloc_init.h b/Include/internal/pycore_obmalloc_init.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6811b7aeca73c196b16519cb12a902af442f998
--- /dev/null
+++ b/Include/internal/pycore_obmalloc_init.h
@@ -0,0 +1,66 @@
+#ifndef Py_INTERNAL_OBMALLOC_INIT_H
+#define Py_INTERNAL_OBMALLOC_INIT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+/****************************************************/
+/* the default object allocator's state initializer */
+
+#define PTA(pools, x) \
+    ((poolp )((uint8_t *)&(pools.used[2*(x)]) - 2*sizeof(pymem_block *)))
+#define PT(p, x) PTA(p, x), PTA(p, x)
+
+#define PT_8(p, start) \
+    PT(p, start), \
+    PT(p, start+1), \
+    PT(p, start+2), \
+    PT(p, start+3), \
+    PT(p, start+4), \
+    PT(p, start+5), \
+    PT(p, start+6), \
+    PT(p, start+7)
+
+#if NB_SMALL_SIZE_CLASSES <= 8
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0) }
+#elif NB_SMALL_SIZE_CLASSES <= 16
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8) }
+#elif NB_SMALL_SIZE_CLASSES <= 24
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8), PT_8(p, 16) }
+#elif NB_SMALL_SIZE_CLASSES <= 32
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8), PT_8(p, 16), PT_8(p, 24) }
+#elif NB_SMALL_SIZE_CLASSES <= 40
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8), PT_8(p, 16), PT_8(p, 24), PT_8(p, 32) }
+#elif NB_SMALL_SIZE_CLASSES <= 48
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8), PT_8(p, 16), PT_8(p, 24), PT_8(p, 32), PT_8(p, 40) }
+#elif NB_SMALL_SIZE_CLASSES <= 56
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8), PT_8(p, 16), PT_8(p, 24), PT_8(p, 32), PT_8(p, 40), PT_8(p, 48) }
+#elif NB_SMALL_SIZE_CLASSES <= 64
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8), PT_8(p, 16), PT_8(p, 24), PT_8(p, 32), PT_8(p, 40), PT_8(p, 48), PT_8(p, 56) }
+#else
+#  error "NB_SMALL_SIZE_CLASSES should be <= 64"
+#endif
+
+#define _obmalloc_global_state_INIT \
+    { \
+        .dump_debug_stats = -1, \
+    }
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // !Py_INTERNAL_OBMALLOC_INIT_H
diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h
new file mode 100644
index 0000000000000000000000000000000000000000..bbba0bbbf4b0aa914a8da3a145205d88e270ec1f
--- /dev/null
+++ b/Include/internal/pycore_opcode_metadata.h
@@ -0,0 +1,1922 @@
+// This file is generated by Tools/cases_generator/opcode_metadata_generator.py
+// from:
+//   Python/bytecodes.c
+// Do not edit!
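+//
+// [Editor's note] Compiled-out sketch, not part of the generated file: the
+// lookup functions declared below yield an instruction's net stack effect;
+// both return -1 for an unknown opcode.
+#if 0
+static int
+example_net_stack_effect(int opcode, int oparg)
+{
+    int popped = _PyOpcode_num_popped(opcode, oparg);
+    int pushed = _PyOpcode_num_pushed(opcode, oparg);
+    if (popped < 0 || pushed < 0) {
+        return 0;  // unknown opcode
+    }
+    // Positive result means the instruction grows the stack.
+    return pushed - popped;
+}
+#endif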
+ +#ifndef Py_CORE_OPCODE_METADATA_H +#define Py_CORE_OPCODE_METADATA_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include // bool +#include "opcode_ids.h" + + +#define IS_PSEUDO_INSTR(OP) ( \ + ((OP) == LOAD_CLOSURE) || \ + ((OP) == STORE_FAST_MAYBE_NULL) || \ + ((OP) == LOAD_SUPER_METHOD) || \ + ((OP) == LOAD_ZERO_SUPER_METHOD) || \ + ((OP) == LOAD_ZERO_SUPER_ATTR) || \ + ((OP) == LOAD_METHOD) || \ + ((OP) == JUMP) || \ + ((OP) == JUMP_NO_INTERRUPT) || \ + ((OP) == SETUP_FINALLY) || \ + ((OP) == SETUP_CLEANUP) || \ + ((OP) == SETUP_WITH) || \ + ((OP) == POP_BLOCK) || \ + 0) + +#include "pycore_uop_ids.h" +extern int _PyOpcode_num_popped(int opcode, int oparg); +#ifdef NEED_OPCODE_METADATA +int _PyOpcode_num_popped(int opcode, int oparg) { + switch(opcode) { + case BEFORE_ASYNC_WITH: + return 1; + case BEFORE_WITH: + return 1; + case BINARY_OP: + return 2; + case BINARY_OP_ADD_FLOAT: + return 2; + case BINARY_OP_ADD_INT: + return 2; + case BINARY_OP_ADD_UNICODE: + return 2; + case BINARY_OP_INPLACE_ADD_UNICODE: + return 2; + case BINARY_OP_MULTIPLY_FLOAT: + return 2; + case BINARY_OP_MULTIPLY_INT: + return 2; + case BINARY_OP_SUBTRACT_FLOAT: + return 2; + case BINARY_OP_SUBTRACT_INT: + return 2; + case BINARY_SLICE: + return 3; + case BINARY_SUBSCR: + return 2; + case BINARY_SUBSCR_DICT: + return 2; + case BINARY_SUBSCR_GETITEM: + return 2; + case BINARY_SUBSCR_LIST_INT: + return 2; + case BINARY_SUBSCR_STR_INT: + return 2; + case BINARY_SUBSCR_TUPLE_INT: + return 2; + case BUILD_CONST_KEY_MAP: + return 1 + oparg; + case BUILD_LIST: + return oparg; + case BUILD_MAP: + return oparg*2; + case BUILD_SET: + return oparg; + case BUILD_SLICE: + return 2 + ((oparg == 3) ? 
1 : 0); + case BUILD_STRING: + return oparg; + case BUILD_TUPLE: + return oparg; + case CACHE: + return 0; + case CALL: + return 2 + oparg; + case CALL_ALLOC_AND_ENTER_INIT: + return 2 + oparg; + case CALL_BOUND_METHOD_EXACT_ARGS: + return 2 + oparg; + case CALL_BOUND_METHOD_GENERAL: + return 2 + oparg; + case CALL_BUILTIN_CLASS: + return 2 + oparg; + case CALL_BUILTIN_FAST: + return 2 + oparg; + case CALL_BUILTIN_FAST_WITH_KEYWORDS: + return 2 + oparg; + case CALL_BUILTIN_O: + return 2 + oparg; + case CALL_FUNCTION_EX: + return 3 + (oparg & 1); + case CALL_INTRINSIC_1: + return 1; + case CALL_INTRINSIC_2: + return 2; + case CALL_ISINSTANCE: + return 2 + oparg; + case CALL_KW: + return 3 + oparg; + case CALL_LEN: + return 2 + oparg; + case CALL_LIST_APPEND: + return 3; + case CALL_METHOD_DESCRIPTOR_FAST: + return 2 + oparg; + case CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS: + return 2 + oparg; + case CALL_METHOD_DESCRIPTOR_NOARGS: + return 2 + oparg; + case CALL_METHOD_DESCRIPTOR_O: + return 2 + oparg; + case CALL_NON_PY_GENERAL: + return 2 + oparg; + case CALL_PY_EXACT_ARGS: + return 2 + oparg; + case CALL_PY_GENERAL: + return 2 + oparg; + case CALL_STR_1: + return 3; + case CALL_TUPLE_1: + return 3; + case CALL_TYPE_1: + return 3; + case CHECK_EG_MATCH: + return 2; + case CHECK_EXC_MATCH: + return 2; + case CLEANUP_THROW: + return 3; + case COMPARE_OP: + return 2; + case COMPARE_OP_FLOAT: + return 2; + case COMPARE_OP_INT: + return 2; + case COMPARE_OP_STR: + return 2; + case CONTAINS_OP: + return 2; + case CONTAINS_OP_DICT: + return 2; + case CONTAINS_OP_SET: + return 2; + case CONVERT_VALUE: + return 1; + case COPY: + return 1 + (oparg-1); + case COPY_FREE_VARS: + return 0; + case DELETE_ATTR: + return 1; + case DELETE_DEREF: + return 0; + case DELETE_FAST: + return 0; + case DELETE_GLOBAL: + return 0; + case DELETE_NAME: + return 0; + case DELETE_SUBSCR: + return 2; + case DICT_MERGE: + return 5 + (oparg - 1); + case DICT_UPDATE: + return 2 + (oparg - 1); + case END_ASYNC_FOR: + return 2; + case END_FOR: + return 1; + case END_SEND: + return 2; + case ENTER_EXECUTOR: + return 0; + case EXIT_INIT_CHECK: + return 1; + case EXTENDED_ARG: + return 0; + case FORMAT_SIMPLE: + return 1; + case FORMAT_WITH_SPEC: + return 2; + case FOR_ITER: + return 1; + case FOR_ITER_GEN: + return 1; + case FOR_ITER_LIST: + return 1; + case FOR_ITER_RANGE: + return 1; + case FOR_ITER_TUPLE: + return 1; + case GET_AITER: + return 1; + case GET_ANEXT: + return 1; + case GET_AWAITABLE: + return 1; + case GET_ITER: + return 1; + case GET_LEN: + return 1; + case GET_YIELD_FROM_ITER: + return 1; + case IMPORT_FROM: + return 1; + case IMPORT_NAME: + return 2; + case INSTRUMENTED_CALL: + return 0; + case INSTRUMENTED_CALL_FUNCTION_EX: + return 0; + case INSTRUMENTED_CALL_KW: + return 0; + case INSTRUMENTED_END_FOR: + return 2; + case INSTRUMENTED_END_SEND: + return 2; + case INSTRUMENTED_FOR_ITER: + return 0; + case INSTRUMENTED_INSTRUCTION: + return 0; + case INSTRUMENTED_JUMP_BACKWARD: + return 0; + case INSTRUMENTED_JUMP_FORWARD: + return 0; + case INSTRUMENTED_LOAD_SUPER_ATTR: + return 3; + case INSTRUMENTED_POP_JUMP_IF_FALSE: + return 0; + case INSTRUMENTED_POP_JUMP_IF_NONE: + return 0; + case INSTRUMENTED_POP_JUMP_IF_NOT_NONE: + return 0; + case INSTRUMENTED_POP_JUMP_IF_TRUE: + return 0; + case INSTRUMENTED_RESUME: + return 0; + case INSTRUMENTED_RETURN_CONST: + return 0; + case INSTRUMENTED_RETURN_VALUE: + return 1; + case INSTRUMENTED_YIELD_VALUE: + return 1; + case INTERPRETER_EXIT: + return 1; + case IS_OP: 
+ return 2; + case JUMP_BACKWARD: + return 0; + case JUMP_BACKWARD_NO_INTERRUPT: + return 0; + case JUMP_FORWARD: + return 0; + case LIST_APPEND: + return 2 + (oparg-1); + case LIST_EXTEND: + return 2 + (oparg-1); + case LOAD_ASSERTION_ERROR: + return 0; + case LOAD_ATTR: + return 1; + case LOAD_ATTR_CLASS: + return 1; + case LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN: + return 1; + case LOAD_ATTR_INSTANCE_VALUE: + return 1; + case LOAD_ATTR_METHOD_LAZY_DICT: + return 1; + case LOAD_ATTR_METHOD_NO_DICT: + return 1; + case LOAD_ATTR_METHOD_WITH_VALUES: + return 1; + case LOAD_ATTR_MODULE: + return 1; + case LOAD_ATTR_NONDESCRIPTOR_NO_DICT: + return 1; + case LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES: + return 1; + case LOAD_ATTR_PROPERTY: + return 1; + case LOAD_ATTR_SLOT: + return 1; + case LOAD_ATTR_WITH_HINT: + return 1; + case LOAD_BUILD_CLASS: + return 0; + case LOAD_CONST: + return 0; + case LOAD_DEREF: + return 0; + case LOAD_FAST: + return 0; + case LOAD_FAST_AND_CLEAR: + return 0; + case LOAD_FAST_CHECK: + return 0; + case LOAD_FAST_LOAD_FAST: + return 0; + case LOAD_FROM_DICT_OR_DEREF: + return 1; + case LOAD_FROM_DICT_OR_GLOBALS: + return 1; + case LOAD_GLOBAL: + return 0; + case LOAD_GLOBAL_BUILTIN: + return 0; + case LOAD_GLOBAL_MODULE: + return 0; + case LOAD_LOCALS: + return 0; + case LOAD_NAME: + return 0; + case LOAD_SUPER_ATTR: + return 3; + case LOAD_SUPER_ATTR_ATTR: + return 3; + case LOAD_SUPER_ATTR_METHOD: + return 3; + case MAKE_CELL: + return 0; + case MAKE_FUNCTION: + return 1; + case MAP_ADD: + return 3 + (oparg - 1); + case MATCH_CLASS: + return 3; + case MATCH_KEYS: + return 2; + case MATCH_MAPPING: + return 1; + case MATCH_SEQUENCE: + return 1; + case NOP: + return 0; + case POP_EXCEPT: + return 1; + case POP_JUMP_IF_FALSE: + return 1; + case POP_JUMP_IF_NONE: + return 1; + case POP_JUMP_IF_NOT_NONE: + return 1; + case POP_JUMP_IF_TRUE: + return 1; + case POP_TOP: + return 1; + case PUSH_EXC_INFO: + return 1; + case PUSH_NULL: + return 0; + case RAISE_VARARGS: + return oparg; + case RERAISE: + return 1 + oparg; + case RESERVED: + return 0; + case RESUME: + return 0; + case RESUME_CHECK: + return 0; + case RETURN_CONST: + return 0; + case RETURN_GENERATOR: + return 0; + case RETURN_VALUE: + return 1; + case SEND: + return 2; + case SEND_GEN: + return 2; + case SETUP_ANNOTATIONS: + return 0; + case SET_ADD: + return 2 + (oparg-1); + case SET_FUNCTION_ATTRIBUTE: + return 2; + case SET_UPDATE: + return 2 + (oparg-1); + case STORE_ATTR: + return 2; + case STORE_ATTR_INSTANCE_VALUE: + return 2; + case STORE_ATTR_SLOT: + return 2; + case STORE_ATTR_WITH_HINT: + return 2; + case STORE_DEREF: + return 1; + case STORE_FAST: + return 1; + case STORE_FAST_LOAD_FAST: + return 1; + case STORE_FAST_STORE_FAST: + return 2; + case STORE_GLOBAL: + return 1; + case STORE_NAME: + return 1; + case STORE_SLICE: + return 4; + case STORE_SUBSCR: + return 3; + case STORE_SUBSCR_DICT: + return 3; + case STORE_SUBSCR_LIST_INT: + return 3; + case SWAP: + return 2 + (oparg-2); + case TO_BOOL: + return 1; + case TO_BOOL_ALWAYS_TRUE: + return 1; + case TO_BOOL_BOOL: + return 1; + case TO_BOOL_INT: + return 1; + case TO_BOOL_LIST: + return 1; + case TO_BOOL_NONE: + return 1; + case TO_BOOL_STR: + return 1; + case UNARY_INVERT: + return 1; + case UNARY_NEGATIVE: + return 1; + case UNARY_NOT: + return 1; + case UNPACK_EX: + return 1; + case UNPACK_SEQUENCE: + return 1; + case UNPACK_SEQUENCE_LIST: + return 1; + case UNPACK_SEQUENCE_TUPLE: + return 1; + case UNPACK_SEQUENCE_TWO_TUPLE: + return 1; + case 
WITH_EXCEPT_START: + return 4; + case YIELD_VALUE: + return 1; + default: + return -1; + } +} + +#endif + +extern int _PyOpcode_num_pushed(int opcode, int oparg); +#ifdef NEED_OPCODE_METADATA +int _PyOpcode_num_pushed(int opcode, int oparg) { + switch(opcode) { + case BEFORE_ASYNC_WITH: + return 2; + case BEFORE_WITH: + return 2; + case BINARY_OP: + return 1; + case BINARY_OP_ADD_FLOAT: + return 1; + case BINARY_OP_ADD_INT: + return 1; + case BINARY_OP_ADD_UNICODE: + return 1; + case BINARY_OP_INPLACE_ADD_UNICODE: + return 0; + case BINARY_OP_MULTIPLY_FLOAT: + return 1; + case BINARY_OP_MULTIPLY_INT: + return 1; + case BINARY_OP_SUBTRACT_FLOAT: + return 1; + case BINARY_OP_SUBTRACT_INT: + return 1; + case BINARY_SLICE: + return 1; + case BINARY_SUBSCR: + return 1; + case BINARY_SUBSCR_DICT: + return 1; + case BINARY_SUBSCR_GETITEM: + return 1; + case BINARY_SUBSCR_LIST_INT: + return 1; + case BINARY_SUBSCR_STR_INT: + return 1; + case BINARY_SUBSCR_TUPLE_INT: + return 1; + case BUILD_CONST_KEY_MAP: + return 1; + case BUILD_LIST: + return 1; + case BUILD_MAP: + return 1; + case BUILD_SET: + return 1; + case BUILD_SLICE: + return 1; + case BUILD_STRING: + return 1; + case BUILD_TUPLE: + return 1; + case CACHE: + return 0; + case CALL: + return 1; + case CALL_ALLOC_AND_ENTER_INIT: + return 1; + case CALL_BOUND_METHOD_EXACT_ARGS: + return 0; + case CALL_BOUND_METHOD_GENERAL: + return 0; + case CALL_BUILTIN_CLASS: + return 1; + case CALL_BUILTIN_FAST: + return 1; + case CALL_BUILTIN_FAST_WITH_KEYWORDS: + return 1; + case CALL_BUILTIN_O: + return 1; + case CALL_FUNCTION_EX: + return 1; + case CALL_INTRINSIC_1: + return 1; + case CALL_INTRINSIC_2: + return 1; + case CALL_ISINSTANCE: + return 1; + case CALL_KW: + return 1; + case CALL_LEN: + return 1; + case CALL_LIST_APPEND: + return 1; + case CALL_METHOD_DESCRIPTOR_FAST: + return 1; + case CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS: + return 1; + case CALL_METHOD_DESCRIPTOR_NOARGS: + return 1; + case CALL_METHOD_DESCRIPTOR_O: + return 1; + case CALL_NON_PY_GENERAL: + return 1; + case CALL_PY_EXACT_ARGS: + return 0; + case CALL_PY_GENERAL: + return 0; + case CALL_STR_1: + return 1; + case CALL_TUPLE_1: + return 1; + case CALL_TYPE_1: + return 1; + case CHECK_EG_MATCH: + return 2; + case CHECK_EXC_MATCH: + return 2; + case CLEANUP_THROW: + return 2; + case COMPARE_OP: + return 1; + case COMPARE_OP_FLOAT: + return 1; + case COMPARE_OP_INT: + return 1; + case COMPARE_OP_STR: + return 1; + case CONTAINS_OP: + return 1; + case CONTAINS_OP_DICT: + return 1; + case CONTAINS_OP_SET: + return 1; + case CONVERT_VALUE: + return 1; + case COPY: + return 2 + (oparg-1); + case COPY_FREE_VARS: + return 0; + case DELETE_ATTR: + return 0; + case DELETE_DEREF: + return 0; + case DELETE_FAST: + return 0; + case DELETE_GLOBAL: + return 0; + case DELETE_NAME: + return 0; + case DELETE_SUBSCR: + return 0; + case DICT_MERGE: + return 4 + (oparg - 1); + case DICT_UPDATE: + return 1 + (oparg - 1); + case END_ASYNC_FOR: + return 0; + case END_FOR: + return 0; + case END_SEND: + return 1; + case ENTER_EXECUTOR: + return 0; + case EXIT_INIT_CHECK: + return 0; + case EXTENDED_ARG: + return 0; + case FORMAT_SIMPLE: + return 1; + case FORMAT_WITH_SPEC: + return 1; + case FOR_ITER: + return 2; + case FOR_ITER_GEN: + return 1; + case FOR_ITER_LIST: + return 2; + case FOR_ITER_RANGE: + return 2; + case FOR_ITER_TUPLE: + return 2; + case GET_AITER: + return 1; + case GET_ANEXT: + return 2; + case GET_AWAITABLE: + return 1; + case GET_ITER: + return 1; + case GET_LEN: + return 2; + 
case GET_YIELD_FROM_ITER: + return 1; + case IMPORT_FROM: + return 2; + case IMPORT_NAME: + return 1; + case INSTRUMENTED_CALL: + return 0; + case INSTRUMENTED_CALL_FUNCTION_EX: + return 0; + case INSTRUMENTED_CALL_KW: + return 0; + case INSTRUMENTED_END_FOR: + return 1; + case INSTRUMENTED_END_SEND: + return 1; + case INSTRUMENTED_FOR_ITER: + return 0; + case INSTRUMENTED_INSTRUCTION: + return 0; + case INSTRUMENTED_JUMP_BACKWARD: + return 0; + case INSTRUMENTED_JUMP_FORWARD: + return 0; + case INSTRUMENTED_LOAD_SUPER_ATTR: + return 1 + (oparg & 1); + case INSTRUMENTED_POP_JUMP_IF_FALSE: + return 0; + case INSTRUMENTED_POP_JUMP_IF_NONE: + return 0; + case INSTRUMENTED_POP_JUMP_IF_NOT_NONE: + return 0; + case INSTRUMENTED_POP_JUMP_IF_TRUE: + return 0; + case INSTRUMENTED_RESUME: + return 0; + case INSTRUMENTED_RETURN_CONST: + return 0; + case INSTRUMENTED_RETURN_VALUE: + return 0; + case INSTRUMENTED_YIELD_VALUE: + return 1; + case INTERPRETER_EXIT: + return 0; + case IS_OP: + return 1; + case JUMP_BACKWARD: + return 0; + case JUMP_BACKWARD_NO_INTERRUPT: + return 0; + case JUMP_FORWARD: + return 0; + case LIST_APPEND: + return 1 + (oparg-1); + case LIST_EXTEND: + return 1 + (oparg-1); + case LOAD_ASSERTION_ERROR: + return 1; + case LOAD_ATTR: + return 1 + (oparg & 1); + case LOAD_ATTR_CLASS: + return 1 + (oparg & 1); + case LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN: + return 1; + case LOAD_ATTR_INSTANCE_VALUE: + return 1 + (oparg & 1); + case LOAD_ATTR_METHOD_LAZY_DICT: + return 2; + case LOAD_ATTR_METHOD_NO_DICT: + return 2; + case LOAD_ATTR_METHOD_WITH_VALUES: + return 2; + case LOAD_ATTR_MODULE: + return 1 + (oparg & 1); + case LOAD_ATTR_NONDESCRIPTOR_NO_DICT: + return 1; + case LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES: + return 1; + case LOAD_ATTR_PROPERTY: + return 1; + case LOAD_ATTR_SLOT: + return 1 + (oparg & 1); + case LOAD_ATTR_WITH_HINT: + return 1 + (oparg & 1); + case LOAD_BUILD_CLASS: + return 1; + case LOAD_CONST: + return 1; + case LOAD_DEREF: + return 1; + case LOAD_FAST: + return 1; + case LOAD_FAST_AND_CLEAR: + return 1; + case LOAD_FAST_CHECK: + return 1; + case LOAD_FAST_LOAD_FAST: + return 2; + case LOAD_FROM_DICT_OR_DEREF: + return 1; + case LOAD_FROM_DICT_OR_GLOBALS: + return 1; + case LOAD_GLOBAL: + return 1 + (oparg & 1); + case LOAD_GLOBAL_BUILTIN: + return 1 + (oparg & 1); + case LOAD_GLOBAL_MODULE: + return 1 + (oparg & 1); + case LOAD_LOCALS: + return 1; + case LOAD_NAME: + return 1; + case LOAD_SUPER_ATTR: + return 1 + (oparg & 1); + case LOAD_SUPER_ATTR_ATTR: + return 1; + case LOAD_SUPER_ATTR_METHOD: + return 2; + case MAKE_CELL: + return 0; + case MAKE_FUNCTION: + return 1; + case MAP_ADD: + return 1 + (oparg - 1); + case MATCH_CLASS: + return 1; + case MATCH_KEYS: + return 3; + case MATCH_MAPPING: + return 2; + case MATCH_SEQUENCE: + return 2; + case NOP: + return 0; + case POP_EXCEPT: + return 0; + case POP_JUMP_IF_FALSE: + return 0; + case POP_JUMP_IF_NONE: + return 0; + case POP_JUMP_IF_NOT_NONE: + return 0; + case POP_JUMP_IF_TRUE: + return 0; + case POP_TOP: + return 0; + case PUSH_EXC_INFO: + return 2; + case PUSH_NULL: + return 1; + case RAISE_VARARGS: + return 0; + case RERAISE: + return oparg; + case RESERVED: + return 0; + case RESUME: + return 0; + case RESUME_CHECK: + return 0; + case RETURN_CONST: + return 0; + case RETURN_GENERATOR: + return 1; + case RETURN_VALUE: + return 0; + case SEND: + return 2; + case SEND_GEN: + return 2; + case SETUP_ANNOTATIONS: + return 0; + case SET_ADD: + return 1 + (oparg-1); + case SET_FUNCTION_ATTRIBUTE: + return 1; + 
case SET_UPDATE: + return 1 + (oparg-1); + case STORE_ATTR: + return 0; + case STORE_ATTR_INSTANCE_VALUE: + return 0; + case STORE_ATTR_SLOT: + return 0; + case STORE_ATTR_WITH_HINT: + return 0; + case STORE_DEREF: + return 0; + case STORE_FAST: + return 0; + case STORE_FAST_LOAD_FAST: + return 1; + case STORE_FAST_STORE_FAST: + return 0; + case STORE_GLOBAL: + return 0; + case STORE_NAME: + return 0; + case STORE_SLICE: + return 0; + case STORE_SUBSCR: + return 0; + case STORE_SUBSCR_DICT: + return 0; + case STORE_SUBSCR_LIST_INT: + return 0; + case SWAP: + return 2 + (oparg-2); + case TO_BOOL: + return 1; + case TO_BOOL_ALWAYS_TRUE: + return 1; + case TO_BOOL_BOOL: + return 1; + case TO_BOOL_INT: + return 1; + case TO_BOOL_LIST: + return 1; + case TO_BOOL_NONE: + return 1; + case TO_BOOL_STR: + return 1; + case UNARY_INVERT: + return 1; + case UNARY_NEGATIVE: + return 1; + case UNARY_NOT: + return 1; + case UNPACK_EX: + return 1 + (oparg >> 8) + (oparg & 0xFF); + case UNPACK_SEQUENCE: + return oparg; + case UNPACK_SEQUENCE_LIST: + return oparg; + case UNPACK_SEQUENCE_TUPLE: + return oparg; + case UNPACK_SEQUENCE_TWO_TUPLE: + return 2; + case WITH_EXCEPT_START: + return 5; + case YIELD_VALUE: + return 1; + default: + return -1; + } +} + +#endif + +enum InstructionFormat { + INSTR_FMT_IB = 1, + INSTR_FMT_IBC = 2, + INSTR_FMT_IBC00 = 3, + INSTR_FMT_IBC000 = 4, + INSTR_FMT_IBC00000000 = 5, + INSTR_FMT_IX = 6, + INSTR_FMT_IXC = 7, + INSTR_FMT_IXC00 = 8, + INSTR_FMT_IXC000 = 9, +}; + +#define IS_VALID_OPCODE(OP) \ + (((OP) >= 0) && ((OP) < 268) && \ + (_PyOpcode_opcode_metadata[(OP)].valid_entry)) + +#define HAS_ARG_FLAG (1) +#define HAS_CONST_FLAG (2) +#define HAS_NAME_FLAG (4) +#define HAS_JUMP_FLAG (8) +#define HAS_FREE_FLAG (16) +#define HAS_LOCAL_FLAG (32) +#define HAS_EVAL_BREAK_FLAG (64) +#define HAS_DEOPT_FLAG (128) +#define HAS_ERROR_FLAG (256) +#define HAS_ESCAPES_FLAG (512) +#define HAS_EXIT_FLAG (1024) +#define HAS_PURE_FLAG (2048) +#define HAS_PASSTHROUGH_FLAG (4096) +#define HAS_OPARG_AND_1_FLAG (8192) +#define HAS_ERROR_NO_POP_FLAG (16384) +#define OPCODE_HAS_ARG(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_ARG_FLAG)) +#define OPCODE_HAS_CONST(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_CONST_FLAG)) +#define OPCODE_HAS_NAME(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_NAME_FLAG)) +#define OPCODE_HAS_JUMP(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_JUMP_FLAG)) +#define OPCODE_HAS_FREE(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_FREE_FLAG)) +#define OPCODE_HAS_LOCAL(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_LOCAL_FLAG)) +#define OPCODE_HAS_EVAL_BREAK(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_EVAL_BREAK_FLAG)) +#define OPCODE_HAS_DEOPT(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_DEOPT_FLAG)) +#define OPCODE_HAS_ERROR(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_ERROR_FLAG)) +#define OPCODE_HAS_ESCAPES(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_ESCAPES_FLAG)) +#define OPCODE_HAS_EXIT(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_EXIT_FLAG)) +#define OPCODE_HAS_PURE(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_PURE_FLAG)) +#define OPCODE_HAS_PASSTHROUGH(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_PASSTHROUGH_FLAG)) +#define OPCODE_HAS_OPARG_AND_1(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_OPARG_AND_1_FLAG)) +#define OPCODE_HAS_ERROR_NO_POP(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_ERROR_NO_POP_FLAG)) + +#define OPARG_FULL 0 +#define OPARG_CACHE_1 1 +#define OPARG_CACHE_2 2 +#define OPARG_CACHE_4 4 +#define OPARG_TOP 5 
+#define OPARG_BOTTOM 6 +#define OPARG_SAVE_RETURN_OFFSET 7 +#define OPARG_REPLACED 9 + +struct opcode_metadata { + uint8_t valid_entry; + int8_t instr_format; + int16_t flags; +}; + +extern const struct opcode_metadata _PyOpcode_opcode_metadata[268]; +#ifdef NEED_OPCODE_METADATA +const struct opcode_metadata _PyOpcode_opcode_metadata[268] = { + [BEFORE_ASYNC_WITH] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [BEFORE_WITH] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [BINARY_OP] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [BINARY_OP_ADD_FLOAT] = { true, INSTR_FMT_IXC, HAS_EXIT_FLAG }, + [BINARY_OP_ADD_INT] = { true, INSTR_FMT_IXC, HAS_EXIT_FLAG | HAS_ERROR_FLAG }, + [BINARY_OP_ADD_UNICODE] = { true, INSTR_FMT_IXC, HAS_EXIT_FLAG | HAS_ERROR_FLAG }, + [BINARY_OP_INPLACE_ADD_UNICODE] = { true, INSTR_FMT_IXC, HAS_LOCAL_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [BINARY_OP_MULTIPLY_FLOAT] = { true, INSTR_FMT_IXC, HAS_EXIT_FLAG }, + [BINARY_OP_MULTIPLY_INT] = { true, INSTR_FMT_IXC, HAS_EXIT_FLAG | HAS_ERROR_FLAG }, + [BINARY_OP_SUBTRACT_FLOAT] = { true, INSTR_FMT_IXC, HAS_EXIT_FLAG }, + [BINARY_OP_SUBTRACT_INT] = { true, INSTR_FMT_IXC, HAS_EXIT_FLAG | HAS_ERROR_FLAG }, + [BINARY_SLICE] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [BINARY_SUBSCR] = { true, INSTR_FMT_IXC, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [BINARY_SUBSCR_DICT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [BINARY_SUBSCR_GETITEM] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, + [BINARY_SUBSCR_LIST_INT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG }, + [BINARY_SUBSCR_STR_INT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG }, + [BINARY_SUBSCR_TUPLE_INT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG }, + [BUILD_CONST_KEY_MAP] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [BUILD_LIST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG }, + [BUILD_MAP] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [BUILD_SET] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [BUILD_SLICE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG }, + [BUILD_STRING] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG }, + [BUILD_TUPLE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG }, + [CACHE] = { true, INSTR_FMT_IX, 0 }, + [CALL] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [CALL_ALLOC_AND_ENTER_INIT] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [CALL_BOUND_METHOD_EXACT_ARGS] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [CALL_BOUND_METHOD_GENERAL] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [CALL_BUILTIN_CLASS] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CALL_BUILTIN_FAST] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CALL_BUILTIN_FAST_WITH_KEYWORDS] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CALL_BUILTIN_O] = { true, 
INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CALL_FUNCTION_EX] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [CALL_INTRINSIC_1] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CALL_INTRINSIC_2] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CALL_ISINSTANCE] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [CALL_KW] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [CALL_LEN] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [CALL_LIST_APPEND] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG }, + [CALL_METHOD_DESCRIPTOR_FAST] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CALL_METHOD_DESCRIPTOR_NOARGS] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CALL_METHOD_DESCRIPTOR_O] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CALL_NON_PY_GENERAL] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_EXIT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CALL_PY_EXACT_ARGS] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [CALL_PY_GENERAL] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [CALL_STR_1] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CALL_TUPLE_1] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CALL_TYPE_1] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, + [CHECK_EG_MATCH] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CHECK_EXC_MATCH] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CLEANUP_THROW] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [COMPARE_OP] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [COMPARE_OP_FLOAT] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_EXIT_FLAG }, + [COMPARE_OP_INT] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [COMPARE_OP_STR] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_EXIT_FLAG }, + [CONTAINS_OP] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CONTAINS_OP_DICT] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CONTAINS_OP_SET] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [CONVERT_VALUE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG }, + [COPY] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_PURE_FLAG }, + [COPY_FREE_VARS] = { true, INSTR_FMT_IB, HAS_ARG_FLAG }, + [DELETE_ATTR] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | 
HAS_ESCAPES_FLAG }, + [DELETE_DEREF] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [DELETE_FAST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_LOCAL_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [DELETE_GLOBAL] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [DELETE_NAME] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [DELETE_SUBSCR] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [DICT_MERGE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [DICT_UPDATE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [END_ASYNC_FOR] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [END_FOR] = { true, INSTR_FMT_IX, HAS_PURE_FLAG }, + [END_SEND] = { true, INSTR_FMT_IX, HAS_PURE_FLAG }, + [ENTER_EXECUTOR] = { true, INSTR_FMT_IB, HAS_ARG_FLAG }, + [EXIT_INIT_CHECK] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [EXTENDED_ARG] = { true, INSTR_FMT_IB, HAS_ARG_FLAG }, + [FORMAT_SIMPLE] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [FORMAT_WITH_SPEC] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [FOR_ITER] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_JUMP_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [FOR_ITER_GEN] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, + [FOR_ITER_LIST] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_JUMP_FLAG | HAS_EXIT_FLAG }, + [FOR_ITER_RANGE] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_JUMP_FLAG | HAS_EXIT_FLAG | HAS_ERROR_FLAG }, + [FOR_ITER_TUPLE] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_JUMP_FLAG | HAS_EXIT_FLAG }, + [GET_AITER] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [GET_ANEXT] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [GET_AWAITABLE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [GET_ITER] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [GET_LEN] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [GET_YIELD_FROM_ITER] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [IMPORT_FROM] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [IMPORT_NAME] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [INSTRUMENTED_CALL] = { true, INSTR_FMT_IBC00, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [INSTRUMENTED_CALL_FUNCTION_EX] = { true, INSTR_FMT_IX, 0 }, + [INSTRUMENTED_CALL_KW] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [INSTRUMENTED_END_FOR] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG }, + [INSTRUMENTED_END_SEND] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG }, + [INSTRUMENTED_FOR_ITER] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [INSTRUMENTED_INSTRUCTION] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [INSTRUMENTED_JUMP_BACKWARD] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG }, + [INSTRUMENTED_JUMP_FORWARD] = { true, INSTR_FMT_IB, HAS_ARG_FLAG }, + [INSTRUMENTED_LOAD_SUPER_ATTR] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG }, + 
[INSTRUMENTED_POP_JUMP_IF_FALSE] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG }, + [INSTRUMENTED_POP_JUMP_IF_NONE] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG }, + [INSTRUMENTED_POP_JUMP_IF_NOT_NONE] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG }, + [INSTRUMENTED_POP_JUMP_IF_TRUE] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG }, + [INSTRUMENTED_RESUME] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [INSTRUMENTED_RETURN_CONST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_CONST_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [INSTRUMENTED_RETURN_VALUE] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [INSTRUMENTED_YIELD_VALUE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [INTERPRETER_EXIT] = { true, INSTR_FMT_IX, HAS_ESCAPES_FLAG }, + [IS_OP] = { true, INSTR_FMT_IB, HAS_ARG_FLAG }, + [JUMP_BACKWARD] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_JUMP_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [JUMP_BACKWARD_NO_INTERRUPT] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_JUMP_FLAG }, + [JUMP_FORWARD] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_JUMP_FLAG }, + [LIST_APPEND] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG }, + [LIST_EXTEND] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_ASSERTION_ERROR] = { true, INSTR_FMT_IX, 0 }, + [LOAD_ATTR] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_ATTR_CLASS] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, + [LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_ATTR_INSTANCE_VALUE] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [LOAD_ATTR_METHOD_LAZY_DICT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [LOAD_ATTR_METHOD_NO_DICT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_EXIT_FLAG }, + [LOAD_ATTR_METHOD_WITH_VALUES] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [LOAD_ATTR_MODULE] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, + [LOAD_ATTR_NONDESCRIPTOR_NO_DICT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_EXIT_FLAG }, + [LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [LOAD_ATTR_PROPERTY] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_ATTR_SLOT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [LOAD_ATTR_WITH_HINT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [LOAD_BUILD_CLASS] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_CONST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_CONST_FLAG | HAS_PURE_FLAG }, + [LOAD_DEREF] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_FAST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_LOCAL_FLAG | HAS_PURE_FLAG }, + [LOAD_FAST_AND_CLEAR] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_LOCAL_FLAG }, + [LOAD_FAST_CHECK] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_LOCAL_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_FAST_LOAD_FAST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_LOCAL_FLAG }, + [LOAD_FROM_DICT_OR_DEREF] = { 
true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_FROM_DICT_OR_GLOBALS] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_GLOBAL] = { true, INSTR_FMT_IBC000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_GLOBAL_BUILTIN] = { true, INSTR_FMT_IBC000, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, + [LOAD_GLOBAL_MODULE] = { true, INSTR_FMT_IBC000, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, + [LOAD_LOCALS] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_NAME] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_SUPER_ATTR] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_SUPER_ATTR_ATTR] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_SUPER_ATTR_METHOD] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [MAKE_CELL] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG }, + [MAKE_FUNCTION] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [MAP_ADD] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [MATCH_CLASS] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [MATCH_KEYS] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [MATCH_MAPPING] = { true, INSTR_FMT_IX, 0 }, + [MATCH_SEQUENCE] = { true, INSTR_FMT_IX, 0 }, + [NOP] = { true, INSTR_FMT_IX, HAS_PURE_FLAG }, + [POP_EXCEPT] = { true, INSTR_FMT_IX, HAS_ESCAPES_FLAG }, + [POP_JUMP_IF_FALSE] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_JUMP_FLAG }, + [POP_JUMP_IF_NONE] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_JUMP_FLAG }, + [POP_JUMP_IF_NOT_NONE] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_JUMP_FLAG }, + [POP_JUMP_IF_TRUE] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_JUMP_FLAG }, + [POP_TOP] = { true, INSTR_FMT_IX, HAS_PURE_FLAG }, + [PUSH_EXC_INFO] = { true, INSTR_FMT_IX, 0 }, + [PUSH_NULL] = { true, INSTR_FMT_IX, HAS_PURE_FLAG }, + [RAISE_VARARGS] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [RERAISE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [RESERVED] = { true, INSTR_FMT_IX, 0 }, + [RESUME] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [RESUME_CHECK] = { true, INSTR_FMT_IX, HAS_DEOPT_FLAG }, + [RETURN_CONST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_CONST_FLAG }, + [RETURN_GENERATOR] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [RETURN_VALUE] = { true, INSTR_FMT_IX, 0 }, + [SEND] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_JUMP_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, + [SEND_GEN] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, + [SETUP_ANNOTATIONS] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [SET_ADD] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [SET_FUNCTION_ATTRIBUTE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ESCAPES_FLAG }, + [SET_UPDATE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [STORE_ATTR] = { true, INSTR_FMT_IBC000, HAS_ARG_FLAG | 
HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [STORE_ATTR_INSTANCE_VALUE] = { true, INSTR_FMT_IXC000, HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [STORE_ATTR_SLOT] = { true, INSTR_FMT_IXC000, HAS_EXIT_FLAG }, + [STORE_ATTR_WITH_HINT] = { true, INSTR_FMT_IBC000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, + [STORE_DEREF] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ESCAPES_FLAG }, + [STORE_FAST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_LOCAL_FLAG }, + [STORE_FAST_LOAD_FAST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_LOCAL_FLAG }, + [STORE_FAST_STORE_FAST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_LOCAL_FLAG }, + [STORE_GLOBAL] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [STORE_NAME] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [STORE_SLICE] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [STORE_SUBSCR] = { true, INSTR_FMT_IXC, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [STORE_SUBSCR_DICT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [STORE_SUBSCR_LIST_INT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG }, + [SWAP] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_PURE_FLAG }, + [TO_BOOL] = { true, INSTR_FMT_IXC00, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [TO_BOOL_ALWAYS_TRUE] = { true, INSTR_FMT_IXC00, HAS_EXIT_FLAG }, + [TO_BOOL_BOOL] = { true, INSTR_FMT_IXC00, HAS_EXIT_FLAG }, + [TO_BOOL_INT] = { true, INSTR_FMT_IXC00, HAS_EXIT_FLAG | HAS_ESCAPES_FLAG }, + [TO_BOOL_LIST] = { true, INSTR_FMT_IXC00, HAS_EXIT_FLAG }, + [TO_BOOL_NONE] = { true, INSTR_FMT_IXC00, HAS_EXIT_FLAG }, + [TO_BOOL_STR] = { true, INSTR_FMT_IXC00, HAS_EXIT_FLAG | HAS_ESCAPES_FLAG }, + [UNARY_INVERT] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [UNARY_NEGATIVE] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [UNARY_NOT] = { true, INSTR_FMT_IX, HAS_PURE_FLAG }, + [UNPACK_EX] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [UNPACK_SEQUENCE] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [UNPACK_SEQUENCE_LIST] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, + [UNPACK_SEQUENCE_TUPLE] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, + [UNPACK_SEQUENCE_TWO_TUPLE] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, + [WITH_EXCEPT_START] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [YIELD_VALUE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ESCAPES_FLAG }, + [JUMP] = { true, -1, HAS_ARG_FLAG | HAS_JUMP_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [JUMP_NO_INTERRUPT] = { true, -1, HAS_ARG_FLAG | HAS_JUMP_FLAG }, + [LOAD_CLOSURE] = { true, -1, HAS_ARG_FLAG | HAS_LOCAL_FLAG | HAS_PURE_FLAG }, + [LOAD_METHOD] = { true, -1, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_SUPER_METHOD] = { true, -1, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_ZERO_SUPER_ATTR] = { true, -1, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_ZERO_SUPER_METHOD] = { true, -1, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [POP_BLOCK] = { true, -1, HAS_PURE_FLAG }, + [SETUP_CLEANUP] = { true, -1, HAS_PURE_FLAG | HAS_ARG_FLAG }, + [SETUP_FINALLY] = { true, -1, HAS_PURE_FLAG | HAS_ARG_FLAG }, + [SETUP_WITH] = { true, -1, HAS_PURE_FLAG | HAS_ARG_FLAG }, + [STORE_FAST_MAYBE_NULL] = { true, -1, HAS_ARG_FLAG | HAS_LOCAL_FLAG }, +}; 
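The flag macros above are the intended read API for this table: each one masks _PyOpcode_opcode_metadata[OP].flags with a single HAS_*_FLAG bit. A minimal consumer sketch follows (illustrative only, not part of the patch; describe_opcode is a hypothetical helper, and it assumes a translation unit that includes this header with NEED_OPCODE_METADATA defined so the table and the _PyOpcode_OpName array are instantiated):

    #include <stdio.h>

    // Hypothetical helper: report the properties recorded for an opcode
    // in _PyOpcode_opcode_metadata, via the OPCODE_HAS_* flag macros.
    static void
    describe_opcode(int op)
    {
        if (!IS_VALID_OPCODE(op)) {
            printf("%d: invalid opcode\n", op);
            return;
        }
        printf("%s:%s%s%s\n",
               _PyOpcode_OpName[op],
               OPCODE_HAS_ARG(op)   ? " has-arg"   : "",
               OPCODE_HAS_JUMP(op)  ? " jumps"     : "",
               OPCODE_HAS_DEOPT(op) ? " can-deopt" : "");
    }

Per the table entries above, describe_opcode(LOAD_ATTR) would report "has-arg" only, while a specialized variant such as LOAD_ATTR_CLASS would additionally report "can-deopt".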
+#endif + +#define MAX_UOP_PER_EXPANSION 8 +struct opcode_macro_expansion { + int nuops; + struct { int16_t uop; int8_t size; int8_t offset; } uops[MAX_UOP_PER_EXPANSION]; +}; +extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[256]; + +#ifdef NEED_OPCODE_METADATA +const struct opcode_macro_expansion +_PyOpcode_macro_expansion[256] = { + [BINARY_OP] = { .nuops = 1, .uops = { { _BINARY_OP, 0, 0 } } }, + [BINARY_OP_ADD_FLOAT] = { .nuops = 2, .uops = { { _GUARD_BOTH_FLOAT, 0, 0 }, { _BINARY_OP_ADD_FLOAT, 0, 0 } } }, + [BINARY_OP_ADD_INT] = { .nuops = 2, .uops = { { _GUARD_BOTH_INT, 0, 0 }, { _BINARY_OP_ADD_INT, 0, 0 } } }, + [BINARY_OP_ADD_UNICODE] = { .nuops = 2, .uops = { { _GUARD_BOTH_UNICODE, 0, 0 }, { _BINARY_OP_ADD_UNICODE, 0, 0 } } }, + [BINARY_OP_MULTIPLY_FLOAT] = { .nuops = 2, .uops = { { _GUARD_BOTH_FLOAT, 0, 0 }, { _BINARY_OP_MULTIPLY_FLOAT, 0, 0 } } }, + [BINARY_OP_MULTIPLY_INT] = { .nuops = 2, .uops = { { _GUARD_BOTH_INT, 0, 0 }, { _BINARY_OP_MULTIPLY_INT, 0, 0 } } }, + [BINARY_OP_SUBTRACT_FLOAT] = { .nuops = 2, .uops = { { _GUARD_BOTH_FLOAT, 0, 0 }, { _BINARY_OP_SUBTRACT_FLOAT, 0, 0 } } }, + [BINARY_OP_SUBTRACT_INT] = { .nuops = 2, .uops = { { _GUARD_BOTH_INT, 0, 0 }, { _BINARY_OP_SUBTRACT_INT, 0, 0 } } }, + [BINARY_SLICE] = { .nuops = 1, .uops = { { _BINARY_SLICE, 0, 0 } } }, + [BINARY_SUBSCR] = { .nuops = 1, .uops = { { _BINARY_SUBSCR, 0, 0 } } }, + [BINARY_SUBSCR_DICT] = { .nuops = 1, .uops = { { _BINARY_SUBSCR_DICT, 0, 0 } } }, + [BINARY_SUBSCR_LIST_INT] = { .nuops = 1, .uops = { { _BINARY_SUBSCR_LIST_INT, 0, 0 } } }, + [BINARY_SUBSCR_STR_INT] = { .nuops = 1, .uops = { { _BINARY_SUBSCR_STR_INT, 0, 0 } } }, + [BINARY_SUBSCR_TUPLE_INT] = { .nuops = 1, .uops = { { _BINARY_SUBSCR_TUPLE_INT, 0, 0 } } }, + [BUILD_CONST_KEY_MAP] = { .nuops = 1, .uops = { { _BUILD_CONST_KEY_MAP, 0, 0 } } }, + [BUILD_LIST] = { .nuops = 1, .uops = { { _BUILD_LIST, 0, 0 } } }, + [BUILD_MAP] = { .nuops = 1, .uops = { { _BUILD_MAP, 0, 0 } } }, + [BUILD_SLICE] = { .nuops = 1, .uops = { { _BUILD_SLICE, 0, 0 } } }, + [BUILD_STRING] = { .nuops = 1, .uops = { { _BUILD_STRING, 0, 0 } } }, + [BUILD_TUPLE] = { .nuops = 1, .uops = { { _BUILD_TUPLE, 0, 0 } } }, + [CALL_BOUND_METHOD_EXACT_ARGS] = { .nuops = 8, .uops = { { _CHECK_PEP_523, 0, 0 }, { _CHECK_CALL_BOUND_METHOD_EXACT_ARGS, 0, 0 }, { _INIT_CALL_BOUND_METHOD_EXACT_ARGS, 0, 0 }, { _CHECK_FUNCTION_EXACT_ARGS, 2, 1 }, { _CHECK_STACK_SPACE, 0, 0 }, { _INIT_CALL_PY_EXACT_ARGS, 0, 0 }, { _SAVE_RETURN_OFFSET, 7, 3 }, { _PUSH_FRAME, 0, 0 } } }, + [CALL_BOUND_METHOD_GENERAL] = { .nuops = 6, .uops = { { _CHECK_PEP_523, 0, 0 }, { _CHECK_METHOD_VERSION, 2, 1 }, { _EXPAND_METHOD, 0, 0 }, { _PY_FRAME_GENERAL, 0, 0 }, { _SAVE_RETURN_OFFSET, 7, 3 }, { _PUSH_FRAME, 0, 0 } } }, + [CALL_BUILTIN_CLASS] = { .nuops = 2, .uops = { { _CALL_BUILTIN_CLASS, 0, 0 }, { _CHECK_PERIODIC, 0, 0 } } }, + [CALL_BUILTIN_FAST] = { .nuops = 2, .uops = { { _CALL_BUILTIN_FAST, 0, 0 }, { _CHECK_PERIODIC, 0, 0 } } }, + [CALL_BUILTIN_FAST_WITH_KEYWORDS] = { .nuops = 2, .uops = { { _CALL_BUILTIN_FAST_WITH_KEYWORDS, 0, 0 }, { _CHECK_PERIODIC, 0, 0 } } }, + [CALL_BUILTIN_O] = { .nuops = 2, .uops = { { _CALL_BUILTIN_O, 0, 0 }, { _CHECK_PERIODIC, 0, 0 } } }, + [CALL_INTRINSIC_1] = { .nuops = 1, .uops = { { _CALL_INTRINSIC_1, 0, 0 } } }, + [CALL_INTRINSIC_2] = { .nuops = 1, .uops = { { _CALL_INTRINSIC_2, 0, 0 } } }, + [CALL_ISINSTANCE] = { .nuops = 1, .uops = { { _CALL_ISINSTANCE, 0, 0 } } }, + [CALL_LEN] = { .nuops = 1, .uops = { { _CALL_LEN, 0, 0 } } }, + [CALL_METHOD_DESCRIPTOR_FAST] 
= { .nuops = 2, .uops = { { _CALL_METHOD_DESCRIPTOR_FAST, 0, 0 }, { _CHECK_PERIODIC, 0, 0 } } }, + [CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS] = { .nuops = 2, .uops = { { _CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS, 0, 0 }, { _CHECK_PERIODIC, 0, 0 } } }, + [CALL_METHOD_DESCRIPTOR_NOARGS] = { .nuops = 2, .uops = { { _CALL_METHOD_DESCRIPTOR_NOARGS, 0, 0 }, { _CHECK_PERIODIC, 0, 0 } } }, + [CALL_METHOD_DESCRIPTOR_O] = { .nuops = 2, .uops = { { _CALL_METHOD_DESCRIPTOR_O, 0, 0 }, { _CHECK_PERIODIC, 0, 0 } } }, + [CALL_NON_PY_GENERAL] = { .nuops = 3, .uops = { { _CHECK_IS_NOT_PY_CALLABLE, 0, 0 }, { _CALL_NON_PY_GENERAL, 0, 0 }, { _CHECK_PERIODIC, 0, 0 } } }, + [CALL_PY_EXACT_ARGS] = { .nuops = 6, .uops = { { _CHECK_PEP_523, 0, 0 }, { _CHECK_FUNCTION_EXACT_ARGS, 2, 1 }, { _CHECK_STACK_SPACE, 0, 0 }, { _INIT_CALL_PY_EXACT_ARGS, 0, 0 }, { _SAVE_RETURN_OFFSET, 7, 3 }, { _PUSH_FRAME, 0, 0 } } }, + [CALL_PY_GENERAL] = { .nuops = 5, .uops = { { _CHECK_PEP_523, 0, 0 }, { _CHECK_FUNCTION_VERSION, 2, 1 }, { _PY_FRAME_GENERAL, 0, 0 }, { _SAVE_RETURN_OFFSET, 7, 3 }, { _PUSH_FRAME, 0, 0 } } }, + [CALL_STR_1] = { .nuops = 2, .uops = { { _CALL_STR_1, 0, 0 }, { _CHECK_PERIODIC, 0, 0 } } }, + [CALL_TUPLE_1] = { .nuops = 2, .uops = { { _CALL_TUPLE_1, 0, 0 }, { _CHECK_PERIODIC, 0, 0 } } }, + [CALL_TYPE_1] = { .nuops = 1, .uops = { { _CALL_TYPE_1, 0, 0 } } }, + [CHECK_EG_MATCH] = { .nuops = 1, .uops = { { _CHECK_EG_MATCH, 0, 0 } } }, + [CHECK_EXC_MATCH] = { .nuops = 1, .uops = { { _CHECK_EXC_MATCH, 0, 0 } } }, + [COMPARE_OP] = { .nuops = 1, .uops = { { _COMPARE_OP, 0, 0 } } }, + [COMPARE_OP_FLOAT] = { .nuops = 2, .uops = { { _GUARD_BOTH_FLOAT, 0, 0 }, { _COMPARE_OP_FLOAT, 0, 0 } } }, + [COMPARE_OP_INT] = { .nuops = 2, .uops = { { _GUARD_BOTH_INT, 0, 0 }, { _COMPARE_OP_INT, 0, 0 } } }, + [COMPARE_OP_STR] = { .nuops = 2, .uops = { { _GUARD_BOTH_UNICODE, 0, 0 }, { _COMPARE_OP_STR, 0, 0 } } }, + [CONTAINS_OP] = { .nuops = 1, .uops = { { _CONTAINS_OP, 0, 0 } } }, + [CONTAINS_OP_DICT] = { .nuops = 1, .uops = { { _CONTAINS_OP_DICT, 0, 0 } } }, + [CONTAINS_OP_SET] = { .nuops = 1, .uops = { { _CONTAINS_OP_SET, 0, 0 } } }, + [CONVERT_VALUE] = { .nuops = 1, .uops = { { _CONVERT_VALUE, 0, 0 } } }, + [COPY] = { .nuops = 1, .uops = { { _COPY, 0, 0 } } }, + [COPY_FREE_VARS] = { .nuops = 1, .uops = { { _COPY_FREE_VARS, 0, 0 } } }, + [DELETE_ATTR] = { .nuops = 1, .uops = { { _DELETE_ATTR, 0, 0 } } }, + [DELETE_DEREF] = { .nuops = 1, .uops = { { _DELETE_DEREF, 0, 0 } } }, + [DELETE_FAST] = { .nuops = 1, .uops = { { _DELETE_FAST, 0, 0 } } }, + [DELETE_GLOBAL] = { .nuops = 1, .uops = { { _DELETE_GLOBAL, 0, 0 } } }, + [DELETE_NAME] = { .nuops = 1, .uops = { { _DELETE_NAME, 0, 0 } } }, + [DELETE_SUBSCR] = { .nuops = 1, .uops = { { _DELETE_SUBSCR, 0, 0 } } }, + [DICT_MERGE] = { .nuops = 1, .uops = { { _DICT_MERGE, 0, 0 } } }, + [DICT_UPDATE] = { .nuops = 1, .uops = { { _DICT_UPDATE, 0, 0 } } }, + [END_FOR] = { .nuops = 1, .uops = { { _POP_TOP, 0, 0 } } }, + [END_SEND] = { .nuops = 1, .uops = { { _END_SEND, 0, 0 } } }, + [EXIT_INIT_CHECK] = { .nuops = 1, .uops = { { _EXIT_INIT_CHECK, 0, 0 } } }, + [FORMAT_SIMPLE] = { .nuops = 1, .uops = { { _FORMAT_SIMPLE, 0, 0 } } }, + [FORMAT_WITH_SPEC] = { .nuops = 1, .uops = { { _FORMAT_WITH_SPEC, 0, 0 } } }, + [FOR_ITER] = { .nuops = 1, .uops = { { _FOR_ITER, 9, 0 } } }, + [FOR_ITER_GEN] = { .nuops = 3, .uops = { { _CHECK_PEP_523, 0, 0 }, { _FOR_ITER_GEN_FRAME, 0, 0 }, { _PUSH_FRAME, 0, 0 } } }, + [FOR_ITER_LIST] = { .nuops = 3, .uops = { { _ITER_CHECK_LIST, 0, 0 }, { _ITER_JUMP_LIST, 9, 1 }, { 
_ITER_NEXT_LIST, 0, 0 } } }, + [FOR_ITER_RANGE] = { .nuops = 3, .uops = { { _ITER_CHECK_RANGE, 0, 0 }, { _ITER_JUMP_RANGE, 9, 1 }, { _ITER_NEXT_RANGE, 0, 0 } } }, + [FOR_ITER_TUPLE] = { .nuops = 3, .uops = { { _ITER_CHECK_TUPLE, 0, 0 }, { _ITER_JUMP_TUPLE, 9, 1 }, { _ITER_NEXT_TUPLE, 0, 0 } } }, + [GET_AITER] = { .nuops = 1, .uops = { { _GET_AITER, 0, 0 } } }, + [GET_ANEXT] = { .nuops = 1, .uops = { { _GET_ANEXT, 0, 0 } } }, + [GET_AWAITABLE] = { .nuops = 1, .uops = { { _GET_AWAITABLE, 0, 0 } } }, + [GET_ITER] = { .nuops = 1, .uops = { { _GET_ITER, 0, 0 } } }, + [GET_LEN] = { .nuops = 1, .uops = { { _GET_LEN, 0, 0 } } }, + [GET_YIELD_FROM_ITER] = { .nuops = 1, .uops = { { _GET_YIELD_FROM_ITER, 0, 0 } } }, + [IS_OP] = { .nuops = 1, .uops = { { _IS_OP, 0, 0 } } }, + [LIST_APPEND] = { .nuops = 1, .uops = { { _LIST_APPEND, 0, 0 } } }, + [LIST_EXTEND] = { .nuops = 1, .uops = { { _LIST_EXTEND, 0, 0 } } }, + [LOAD_ASSERTION_ERROR] = { .nuops = 1, .uops = { { _LOAD_ASSERTION_ERROR, 0, 0 } } }, + [LOAD_ATTR] = { .nuops = 1, .uops = { { _LOAD_ATTR, 0, 0 } } }, + [LOAD_ATTR_CLASS] = { .nuops = 2, .uops = { { _CHECK_ATTR_CLASS, 2, 1 }, { _LOAD_ATTR_CLASS, 4, 5 } } }, + [LOAD_ATTR_INSTANCE_VALUE] = { .nuops = 3, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _CHECK_MANAGED_OBJECT_HAS_VALUES, 0, 0 }, { _LOAD_ATTR_INSTANCE_VALUE, 1, 3 } } }, + [LOAD_ATTR_METHOD_LAZY_DICT] = { .nuops = 3, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _CHECK_ATTR_METHOD_LAZY_DICT, 1, 3 }, { _LOAD_ATTR_METHOD_LAZY_DICT, 4, 5 } } }, + [LOAD_ATTR_METHOD_NO_DICT] = { .nuops = 2, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _LOAD_ATTR_METHOD_NO_DICT, 4, 5 } } }, + [LOAD_ATTR_METHOD_WITH_VALUES] = { .nuops = 4, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT, 0, 0 }, { _GUARD_KEYS_VERSION, 2, 3 }, { _LOAD_ATTR_METHOD_WITH_VALUES, 4, 5 } } }, + [LOAD_ATTR_MODULE] = { .nuops = 2, .uops = { { _CHECK_ATTR_MODULE, 2, 1 }, { _LOAD_ATTR_MODULE, 1, 3 } } }, + [LOAD_ATTR_NONDESCRIPTOR_NO_DICT] = { .nuops = 2, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _LOAD_ATTR_NONDESCRIPTOR_NO_DICT, 4, 5 } } }, + [LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES] = { .nuops = 4, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT, 0, 0 }, { _GUARD_KEYS_VERSION, 2, 3 }, { _LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES, 4, 5 } } }, + [LOAD_ATTR_SLOT] = { .nuops = 2, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _LOAD_ATTR_SLOT, 1, 3 } } }, + [LOAD_ATTR_WITH_HINT] = { .nuops = 3, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _CHECK_ATTR_WITH_HINT, 0, 0 }, { _LOAD_ATTR_WITH_HINT, 1, 3 } } }, + [LOAD_BUILD_CLASS] = { .nuops = 1, .uops = { { _LOAD_BUILD_CLASS, 0, 0 } } }, + [LOAD_CONST] = { .nuops = 1, .uops = { { _LOAD_CONST, 0, 0 } } }, + [LOAD_DEREF] = { .nuops = 1, .uops = { { _LOAD_DEREF, 0, 0 } } }, + [LOAD_FAST] = { .nuops = 1, .uops = { { _LOAD_FAST, 0, 0 } } }, + [LOAD_FAST_AND_CLEAR] = { .nuops = 1, .uops = { { _LOAD_FAST_AND_CLEAR, 0, 0 } } }, + [LOAD_FAST_CHECK] = { .nuops = 1, .uops = { { _LOAD_FAST_CHECK, 0, 0 } } }, + [LOAD_FAST_LOAD_FAST] = { .nuops = 2, .uops = { { _LOAD_FAST, 5, 0 }, { _LOAD_FAST, 6, 0 } } }, + [LOAD_FROM_DICT_OR_DEREF] = { .nuops = 1, .uops = { { _LOAD_FROM_DICT_OR_DEREF, 0, 0 } } }, + [LOAD_GLOBAL] = { .nuops = 1, .uops = { { _LOAD_GLOBAL, 0, 0 } } }, + [LOAD_GLOBAL_BUILTIN] = { .nuops = 3, .uops = { { _GUARD_GLOBALS_VERSION, 1, 1 }, { _GUARD_BUILTINS_VERSION, 1, 2 }, { _LOAD_GLOBAL_BUILTINS, 1, 3 } } }, + [LOAD_GLOBAL_MODULE] = { .nuops = 2, .uops = { { _GUARD_GLOBALS_VERSION, 1, 1 }, { 
_LOAD_GLOBAL_MODULE, 1, 3 } } }, + [LOAD_LOCALS] = { .nuops = 1, .uops = { { _LOAD_LOCALS, 0, 0 } } }, + [LOAD_SUPER_ATTR_ATTR] = { .nuops = 1, .uops = { { _LOAD_SUPER_ATTR_ATTR, 0, 0 } } }, + [LOAD_SUPER_ATTR_METHOD] = { .nuops = 1, .uops = { { _LOAD_SUPER_ATTR_METHOD, 0, 0 } } }, + [MAKE_CELL] = { .nuops = 1, .uops = { { _MAKE_CELL, 0, 0 } } }, + [MAKE_FUNCTION] = { .nuops = 1, .uops = { { _MAKE_FUNCTION, 0, 0 } } }, + [MAP_ADD] = { .nuops = 1, .uops = { { _MAP_ADD, 0, 0 } } }, + [MATCH_CLASS] = { .nuops = 1, .uops = { { _MATCH_CLASS, 0, 0 } } }, + [MATCH_KEYS] = { .nuops = 1, .uops = { { _MATCH_KEYS, 0, 0 } } }, + [MATCH_MAPPING] = { .nuops = 1, .uops = { { _MATCH_MAPPING, 0, 0 } } }, + [MATCH_SEQUENCE] = { .nuops = 1, .uops = { { _MATCH_SEQUENCE, 0, 0 } } }, + [NOP] = { .nuops = 1, .uops = { { _NOP, 0, 0 } } }, + [POP_EXCEPT] = { .nuops = 1, .uops = { { _POP_EXCEPT, 0, 0 } } }, + [POP_JUMP_IF_FALSE] = { .nuops = 1, .uops = { { _POP_JUMP_IF_FALSE, 9, 1 } } }, + [POP_JUMP_IF_NONE] = { .nuops = 2, .uops = { { _IS_NONE, 0, 0 }, { _POP_JUMP_IF_TRUE, 9, 1 } } }, + [POP_JUMP_IF_NOT_NONE] = { .nuops = 2, .uops = { { _IS_NONE, 0, 0 }, { _POP_JUMP_IF_FALSE, 9, 1 } } }, + [POP_JUMP_IF_TRUE] = { .nuops = 1, .uops = { { _POP_JUMP_IF_TRUE, 9, 1 } } }, + [POP_TOP] = { .nuops = 1, .uops = { { _POP_TOP, 0, 0 } } }, + [PUSH_EXC_INFO] = { .nuops = 1, .uops = { { _PUSH_EXC_INFO, 0, 0 } } }, + [PUSH_NULL] = { .nuops = 1, .uops = { { _PUSH_NULL, 0, 0 } } }, + [RESUME_CHECK] = { .nuops = 1, .uops = { { _RESUME_CHECK, 0, 0 } } }, + [RETURN_CONST] = { .nuops = 2, .uops = { { _LOAD_CONST, 0, 0 }, { _POP_FRAME, 0, 0 } } }, + [RETURN_GENERATOR] = { .nuops = 1, .uops = { { _RETURN_GENERATOR, 0, 0 } } }, + [RETURN_VALUE] = { .nuops = 1, .uops = { { _POP_FRAME, 0, 0 } } }, + [SETUP_ANNOTATIONS] = { .nuops = 1, .uops = { { _SETUP_ANNOTATIONS, 0, 0 } } }, + [SET_ADD] = { .nuops = 1, .uops = { { _SET_ADD, 0, 0 } } }, + [SET_FUNCTION_ATTRIBUTE] = { .nuops = 1, .uops = { { _SET_FUNCTION_ATTRIBUTE, 0, 0 } } }, + [SET_UPDATE] = { .nuops = 1, .uops = { { _SET_UPDATE, 0, 0 } } }, + [STORE_ATTR] = { .nuops = 1, .uops = { { _STORE_ATTR, 0, 0 } } }, + [STORE_ATTR_INSTANCE_VALUE] = { .nuops = 3, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _GUARD_DORV_NO_DICT, 0, 0 }, { _STORE_ATTR_INSTANCE_VALUE, 1, 3 } } }, + [STORE_ATTR_SLOT] = { .nuops = 2, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _STORE_ATTR_SLOT, 1, 3 } } }, + [STORE_DEREF] = { .nuops = 1, .uops = { { _STORE_DEREF, 0, 0 } } }, + [STORE_FAST] = { .nuops = 1, .uops = { { _STORE_FAST, 0, 0 } } }, + [STORE_FAST_LOAD_FAST] = { .nuops = 2, .uops = { { _STORE_FAST, 5, 0 }, { _LOAD_FAST, 6, 0 } } }, + [STORE_FAST_STORE_FAST] = { .nuops = 2, .uops = { { _STORE_FAST, 5, 0 }, { _STORE_FAST, 6, 0 } } }, + [STORE_GLOBAL] = { .nuops = 1, .uops = { { _STORE_GLOBAL, 0, 0 } } }, + [STORE_NAME] = { .nuops = 1, .uops = { { _STORE_NAME, 0, 0 } } }, + [STORE_SLICE] = { .nuops = 1, .uops = { { _STORE_SLICE, 0, 0 } } }, + [STORE_SUBSCR] = { .nuops = 1, .uops = { { _STORE_SUBSCR, 0, 0 } } }, + [STORE_SUBSCR_DICT] = { .nuops = 1, .uops = { { _STORE_SUBSCR_DICT, 0, 0 } } }, + [STORE_SUBSCR_LIST_INT] = { .nuops = 1, .uops = { { _STORE_SUBSCR_LIST_INT, 0, 0 } } }, + [SWAP] = { .nuops = 1, .uops = { { _SWAP, 0, 0 } } }, + [TO_BOOL] = { .nuops = 1, .uops = { { _TO_BOOL, 0, 0 } } }, + [TO_BOOL_ALWAYS_TRUE] = { .nuops = 2, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _REPLACE_WITH_TRUE, 0, 0 } } }, + [TO_BOOL_BOOL] = { .nuops = 1, .uops = { { _TO_BOOL_BOOL, 0, 0 } } }, + [TO_BOOL_INT] = { .nuops = 1, .uops 
= { { _TO_BOOL_INT, 0, 0 } } }, + [TO_BOOL_LIST] = { .nuops = 1, .uops = { { _TO_BOOL_LIST, 0, 0 } } }, + [TO_BOOL_NONE] = { .nuops = 1, .uops = { { _TO_BOOL_NONE, 0, 0 } } }, + [TO_BOOL_STR] = { .nuops = 1, .uops = { { _TO_BOOL_STR, 0, 0 } } }, + [UNARY_INVERT] = { .nuops = 1, .uops = { { _UNARY_INVERT, 0, 0 } } }, + [UNARY_NEGATIVE] = { .nuops = 1, .uops = { { _UNARY_NEGATIVE, 0, 0 } } }, + [UNARY_NOT] = { .nuops = 1, .uops = { { _UNARY_NOT, 0, 0 } } }, + [UNPACK_EX] = { .nuops = 1, .uops = { { _UNPACK_EX, 0, 0 } } }, + [UNPACK_SEQUENCE] = { .nuops = 1, .uops = { { _UNPACK_SEQUENCE, 0, 0 } } }, + [UNPACK_SEQUENCE_LIST] = { .nuops = 1, .uops = { { _UNPACK_SEQUENCE_LIST, 0, 0 } } }, + [UNPACK_SEQUENCE_TUPLE] = { .nuops = 1, .uops = { { _UNPACK_SEQUENCE_TUPLE, 0, 0 } } }, + [UNPACK_SEQUENCE_TWO_TUPLE] = { .nuops = 1, .uops = { { _UNPACK_SEQUENCE_TWO_TUPLE, 0, 0 } } }, + [WITH_EXCEPT_START] = { .nuops = 1, .uops = { { _WITH_EXCEPT_START, 0, 0 } } }, + [YIELD_VALUE] = { .nuops = 1, .uops = { { _YIELD_VALUE, 0, 0 } } }, +}; +#endif // NEED_OPCODE_METADATA + +extern const char *_PyOpcode_OpName[268]; +#ifdef NEED_OPCODE_METADATA +const char *_PyOpcode_OpName[268] = { + [BEFORE_ASYNC_WITH] = "BEFORE_ASYNC_WITH", + [BEFORE_WITH] = "BEFORE_WITH", + [BINARY_OP] = "BINARY_OP", + [BINARY_OP_ADD_FLOAT] = "BINARY_OP_ADD_FLOAT", + [BINARY_OP_ADD_INT] = "BINARY_OP_ADD_INT", + [BINARY_OP_ADD_UNICODE] = "BINARY_OP_ADD_UNICODE", + [BINARY_OP_INPLACE_ADD_UNICODE] = "BINARY_OP_INPLACE_ADD_UNICODE", + [BINARY_OP_MULTIPLY_FLOAT] = "BINARY_OP_MULTIPLY_FLOAT", + [BINARY_OP_MULTIPLY_INT] = "BINARY_OP_MULTIPLY_INT", + [BINARY_OP_SUBTRACT_FLOAT] = "BINARY_OP_SUBTRACT_FLOAT", + [BINARY_OP_SUBTRACT_INT] = "BINARY_OP_SUBTRACT_INT", + [BINARY_SLICE] = "BINARY_SLICE", + [BINARY_SUBSCR] = "BINARY_SUBSCR", + [BINARY_SUBSCR_DICT] = "BINARY_SUBSCR_DICT", + [BINARY_SUBSCR_GETITEM] = "BINARY_SUBSCR_GETITEM", + [BINARY_SUBSCR_LIST_INT] = "BINARY_SUBSCR_LIST_INT", + [BINARY_SUBSCR_STR_INT] = "BINARY_SUBSCR_STR_INT", + [BINARY_SUBSCR_TUPLE_INT] = "BINARY_SUBSCR_TUPLE_INT", + [BUILD_CONST_KEY_MAP] = "BUILD_CONST_KEY_MAP", + [BUILD_LIST] = "BUILD_LIST", + [BUILD_MAP] = "BUILD_MAP", + [BUILD_SET] = "BUILD_SET", + [BUILD_SLICE] = "BUILD_SLICE", + [BUILD_STRING] = "BUILD_STRING", + [BUILD_TUPLE] = "BUILD_TUPLE", + [CACHE] = "CACHE", + [CALL] = "CALL", + [CALL_ALLOC_AND_ENTER_INIT] = "CALL_ALLOC_AND_ENTER_INIT", + [CALL_BOUND_METHOD_EXACT_ARGS] = "CALL_BOUND_METHOD_EXACT_ARGS", + [CALL_BOUND_METHOD_GENERAL] = "CALL_BOUND_METHOD_GENERAL", + [CALL_BUILTIN_CLASS] = "CALL_BUILTIN_CLASS", + [CALL_BUILTIN_FAST] = "CALL_BUILTIN_FAST", + [CALL_BUILTIN_FAST_WITH_KEYWORDS] = "CALL_BUILTIN_FAST_WITH_KEYWORDS", + [CALL_BUILTIN_O] = "CALL_BUILTIN_O", + [CALL_FUNCTION_EX] = "CALL_FUNCTION_EX", + [CALL_INTRINSIC_1] = "CALL_INTRINSIC_1", + [CALL_INTRINSIC_2] = "CALL_INTRINSIC_2", + [CALL_ISINSTANCE] = "CALL_ISINSTANCE", + [CALL_KW] = "CALL_KW", + [CALL_LEN] = "CALL_LEN", + [CALL_LIST_APPEND] = "CALL_LIST_APPEND", + [CALL_METHOD_DESCRIPTOR_FAST] = "CALL_METHOD_DESCRIPTOR_FAST", + [CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS] = "CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS", + [CALL_METHOD_DESCRIPTOR_NOARGS] = "CALL_METHOD_DESCRIPTOR_NOARGS", + [CALL_METHOD_DESCRIPTOR_O] = "CALL_METHOD_DESCRIPTOR_O", + [CALL_NON_PY_GENERAL] = "CALL_NON_PY_GENERAL", + [CALL_PY_EXACT_ARGS] = "CALL_PY_EXACT_ARGS", + [CALL_PY_GENERAL] = "CALL_PY_GENERAL", + [CALL_STR_1] = "CALL_STR_1", + [CALL_TUPLE_1] = "CALL_TUPLE_1", + [CALL_TYPE_1] = "CALL_TYPE_1", + [CHECK_EG_MATCH] = 
"CHECK_EG_MATCH", + [CHECK_EXC_MATCH] = "CHECK_EXC_MATCH", + [CLEANUP_THROW] = "CLEANUP_THROW", + [COMPARE_OP] = "COMPARE_OP", + [COMPARE_OP_FLOAT] = "COMPARE_OP_FLOAT", + [COMPARE_OP_INT] = "COMPARE_OP_INT", + [COMPARE_OP_STR] = "COMPARE_OP_STR", + [CONTAINS_OP] = "CONTAINS_OP", + [CONTAINS_OP_DICT] = "CONTAINS_OP_DICT", + [CONTAINS_OP_SET] = "CONTAINS_OP_SET", + [CONVERT_VALUE] = "CONVERT_VALUE", + [COPY] = "COPY", + [COPY_FREE_VARS] = "COPY_FREE_VARS", + [DELETE_ATTR] = "DELETE_ATTR", + [DELETE_DEREF] = "DELETE_DEREF", + [DELETE_FAST] = "DELETE_FAST", + [DELETE_GLOBAL] = "DELETE_GLOBAL", + [DELETE_NAME] = "DELETE_NAME", + [DELETE_SUBSCR] = "DELETE_SUBSCR", + [DICT_MERGE] = "DICT_MERGE", + [DICT_UPDATE] = "DICT_UPDATE", + [END_ASYNC_FOR] = "END_ASYNC_FOR", + [END_FOR] = "END_FOR", + [END_SEND] = "END_SEND", + [ENTER_EXECUTOR] = "ENTER_EXECUTOR", + [EXIT_INIT_CHECK] = "EXIT_INIT_CHECK", + [EXTENDED_ARG] = "EXTENDED_ARG", + [FORMAT_SIMPLE] = "FORMAT_SIMPLE", + [FORMAT_WITH_SPEC] = "FORMAT_WITH_SPEC", + [FOR_ITER] = "FOR_ITER", + [FOR_ITER_GEN] = "FOR_ITER_GEN", + [FOR_ITER_LIST] = "FOR_ITER_LIST", + [FOR_ITER_RANGE] = "FOR_ITER_RANGE", + [FOR_ITER_TUPLE] = "FOR_ITER_TUPLE", + [GET_AITER] = "GET_AITER", + [GET_ANEXT] = "GET_ANEXT", + [GET_AWAITABLE] = "GET_AWAITABLE", + [GET_ITER] = "GET_ITER", + [GET_LEN] = "GET_LEN", + [GET_YIELD_FROM_ITER] = "GET_YIELD_FROM_ITER", + [IMPORT_FROM] = "IMPORT_FROM", + [IMPORT_NAME] = "IMPORT_NAME", + [INSTRUMENTED_CALL] = "INSTRUMENTED_CALL", + [INSTRUMENTED_CALL_FUNCTION_EX] = "INSTRUMENTED_CALL_FUNCTION_EX", + [INSTRUMENTED_CALL_KW] = "INSTRUMENTED_CALL_KW", + [INSTRUMENTED_END_FOR] = "INSTRUMENTED_END_FOR", + [INSTRUMENTED_END_SEND] = "INSTRUMENTED_END_SEND", + [INSTRUMENTED_FOR_ITER] = "INSTRUMENTED_FOR_ITER", + [INSTRUMENTED_INSTRUCTION] = "INSTRUMENTED_INSTRUCTION", + [INSTRUMENTED_JUMP_BACKWARD] = "INSTRUMENTED_JUMP_BACKWARD", + [INSTRUMENTED_JUMP_FORWARD] = "INSTRUMENTED_JUMP_FORWARD", + [INSTRUMENTED_LINE] = "INSTRUMENTED_LINE", + [INSTRUMENTED_LOAD_SUPER_ATTR] = "INSTRUMENTED_LOAD_SUPER_ATTR", + [INSTRUMENTED_POP_JUMP_IF_FALSE] = "INSTRUMENTED_POP_JUMP_IF_FALSE", + [INSTRUMENTED_POP_JUMP_IF_NONE] = "INSTRUMENTED_POP_JUMP_IF_NONE", + [INSTRUMENTED_POP_JUMP_IF_NOT_NONE] = "INSTRUMENTED_POP_JUMP_IF_NOT_NONE", + [INSTRUMENTED_POP_JUMP_IF_TRUE] = "INSTRUMENTED_POP_JUMP_IF_TRUE", + [INSTRUMENTED_RESUME] = "INSTRUMENTED_RESUME", + [INSTRUMENTED_RETURN_CONST] = "INSTRUMENTED_RETURN_CONST", + [INSTRUMENTED_RETURN_VALUE] = "INSTRUMENTED_RETURN_VALUE", + [INSTRUMENTED_YIELD_VALUE] = "INSTRUMENTED_YIELD_VALUE", + [INTERPRETER_EXIT] = "INTERPRETER_EXIT", + [IS_OP] = "IS_OP", + [JUMP] = "JUMP", + [JUMP_BACKWARD] = "JUMP_BACKWARD", + [JUMP_BACKWARD_NO_INTERRUPT] = "JUMP_BACKWARD_NO_INTERRUPT", + [JUMP_FORWARD] = "JUMP_FORWARD", + [JUMP_NO_INTERRUPT] = "JUMP_NO_INTERRUPT", + [LIST_APPEND] = "LIST_APPEND", + [LIST_EXTEND] = "LIST_EXTEND", + [LOAD_ASSERTION_ERROR] = "LOAD_ASSERTION_ERROR", + [LOAD_ATTR] = "LOAD_ATTR", + [LOAD_ATTR_CLASS] = "LOAD_ATTR_CLASS", + [LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN] = "LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN", + [LOAD_ATTR_INSTANCE_VALUE] = "LOAD_ATTR_INSTANCE_VALUE", + [LOAD_ATTR_METHOD_LAZY_DICT] = "LOAD_ATTR_METHOD_LAZY_DICT", + [LOAD_ATTR_METHOD_NO_DICT] = "LOAD_ATTR_METHOD_NO_DICT", + [LOAD_ATTR_METHOD_WITH_VALUES] = "LOAD_ATTR_METHOD_WITH_VALUES", + [LOAD_ATTR_MODULE] = "LOAD_ATTR_MODULE", + [LOAD_ATTR_NONDESCRIPTOR_NO_DICT] = "LOAD_ATTR_NONDESCRIPTOR_NO_DICT", + [LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES] = 
"LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES", + [LOAD_ATTR_PROPERTY] = "LOAD_ATTR_PROPERTY", + [LOAD_ATTR_SLOT] = "LOAD_ATTR_SLOT", + [LOAD_ATTR_WITH_HINT] = "LOAD_ATTR_WITH_HINT", + [LOAD_BUILD_CLASS] = "LOAD_BUILD_CLASS", + [LOAD_CLOSURE] = "LOAD_CLOSURE", + [LOAD_CONST] = "LOAD_CONST", + [LOAD_DEREF] = "LOAD_DEREF", + [LOAD_FAST] = "LOAD_FAST", + [LOAD_FAST_AND_CLEAR] = "LOAD_FAST_AND_CLEAR", + [LOAD_FAST_CHECK] = "LOAD_FAST_CHECK", + [LOAD_FAST_LOAD_FAST] = "LOAD_FAST_LOAD_FAST", + [LOAD_FROM_DICT_OR_DEREF] = "LOAD_FROM_DICT_OR_DEREF", + [LOAD_FROM_DICT_OR_GLOBALS] = "LOAD_FROM_DICT_OR_GLOBALS", + [LOAD_GLOBAL] = "LOAD_GLOBAL", + [LOAD_GLOBAL_BUILTIN] = "LOAD_GLOBAL_BUILTIN", + [LOAD_GLOBAL_MODULE] = "LOAD_GLOBAL_MODULE", + [LOAD_LOCALS] = "LOAD_LOCALS", + [LOAD_METHOD] = "LOAD_METHOD", + [LOAD_NAME] = "LOAD_NAME", + [LOAD_SUPER_ATTR] = "LOAD_SUPER_ATTR", + [LOAD_SUPER_ATTR_ATTR] = "LOAD_SUPER_ATTR_ATTR", + [LOAD_SUPER_ATTR_METHOD] = "LOAD_SUPER_ATTR_METHOD", + [LOAD_SUPER_METHOD] = "LOAD_SUPER_METHOD", + [LOAD_ZERO_SUPER_ATTR] = "LOAD_ZERO_SUPER_ATTR", + [LOAD_ZERO_SUPER_METHOD] = "LOAD_ZERO_SUPER_METHOD", + [MAKE_CELL] = "MAKE_CELL", + [MAKE_FUNCTION] = "MAKE_FUNCTION", + [MAP_ADD] = "MAP_ADD", + [MATCH_CLASS] = "MATCH_CLASS", + [MATCH_KEYS] = "MATCH_KEYS", + [MATCH_MAPPING] = "MATCH_MAPPING", + [MATCH_SEQUENCE] = "MATCH_SEQUENCE", + [NOP] = "NOP", + [POP_BLOCK] = "POP_BLOCK", + [POP_EXCEPT] = "POP_EXCEPT", + [POP_JUMP_IF_FALSE] = "POP_JUMP_IF_FALSE", + [POP_JUMP_IF_NONE] = "POP_JUMP_IF_NONE", + [POP_JUMP_IF_NOT_NONE] = "POP_JUMP_IF_NOT_NONE", + [POP_JUMP_IF_TRUE] = "POP_JUMP_IF_TRUE", + [POP_TOP] = "POP_TOP", + [PUSH_EXC_INFO] = "PUSH_EXC_INFO", + [PUSH_NULL] = "PUSH_NULL", + [RAISE_VARARGS] = "RAISE_VARARGS", + [RERAISE] = "RERAISE", + [RESERVED] = "RESERVED", + [RESUME] = "RESUME", + [RESUME_CHECK] = "RESUME_CHECK", + [RETURN_CONST] = "RETURN_CONST", + [RETURN_GENERATOR] = "RETURN_GENERATOR", + [RETURN_VALUE] = "RETURN_VALUE", + [SEND] = "SEND", + [SEND_GEN] = "SEND_GEN", + [SETUP_ANNOTATIONS] = "SETUP_ANNOTATIONS", + [SETUP_CLEANUP] = "SETUP_CLEANUP", + [SETUP_FINALLY] = "SETUP_FINALLY", + [SETUP_WITH] = "SETUP_WITH", + [SET_ADD] = "SET_ADD", + [SET_FUNCTION_ATTRIBUTE] = "SET_FUNCTION_ATTRIBUTE", + [SET_UPDATE] = "SET_UPDATE", + [STORE_ATTR] = "STORE_ATTR", + [STORE_ATTR_INSTANCE_VALUE] = "STORE_ATTR_INSTANCE_VALUE", + [STORE_ATTR_SLOT] = "STORE_ATTR_SLOT", + [STORE_ATTR_WITH_HINT] = "STORE_ATTR_WITH_HINT", + [STORE_DEREF] = "STORE_DEREF", + [STORE_FAST] = "STORE_FAST", + [STORE_FAST_LOAD_FAST] = "STORE_FAST_LOAD_FAST", + [STORE_FAST_MAYBE_NULL] = "STORE_FAST_MAYBE_NULL", + [STORE_FAST_STORE_FAST] = "STORE_FAST_STORE_FAST", + [STORE_GLOBAL] = "STORE_GLOBAL", + [STORE_NAME] = "STORE_NAME", + [STORE_SLICE] = "STORE_SLICE", + [STORE_SUBSCR] = "STORE_SUBSCR", + [STORE_SUBSCR_DICT] = "STORE_SUBSCR_DICT", + [STORE_SUBSCR_LIST_INT] = "STORE_SUBSCR_LIST_INT", + [SWAP] = "SWAP", + [TO_BOOL] = "TO_BOOL", + [TO_BOOL_ALWAYS_TRUE] = "TO_BOOL_ALWAYS_TRUE", + [TO_BOOL_BOOL] = "TO_BOOL_BOOL", + [TO_BOOL_INT] = "TO_BOOL_INT", + [TO_BOOL_LIST] = "TO_BOOL_LIST", + [TO_BOOL_NONE] = "TO_BOOL_NONE", + [TO_BOOL_STR] = "TO_BOOL_STR", + [UNARY_INVERT] = "UNARY_INVERT", + [UNARY_NEGATIVE] = "UNARY_NEGATIVE", + [UNARY_NOT] = "UNARY_NOT", + [UNPACK_EX] = "UNPACK_EX", + [UNPACK_SEQUENCE] = "UNPACK_SEQUENCE", + [UNPACK_SEQUENCE_LIST] = "UNPACK_SEQUENCE_LIST", + [UNPACK_SEQUENCE_TUPLE] = "UNPACK_SEQUENCE_TUPLE", + [UNPACK_SEQUENCE_TWO_TUPLE] = "UNPACK_SEQUENCE_TWO_TUPLE", + [WITH_EXCEPT_START] = "WITH_EXCEPT_START", 
+ [YIELD_VALUE] = "YIELD_VALUE", +}; +#endif + +extern const uint8_t _PyOpcode_Caches[256]; +#ifdef NEED_OPCODE_METADATA +const uint8_t _PyOpcode_Caches[256] = { + [JUMP_BACKWARD] = 1, + [TO_BOOL] = 3, + [BINARY_SUBSCR] = 1, + [STORE_SUBSCR] = 1, + [SEND] = 1, + [UNPACK_SEQUENCE] = 1, + [STORE_ATTR] = 4, + [LOAD_GLOBAL] = 4, + [LOAD_SUPER_ATTR] = 1, + [LOAD_ATTR] = 9, + [COMPARE_OP] = 1, + [CONTAINS_OP] = 1, + [POP_JUMP_IF_TRUE] = 1, + [POP_JUMP_IF_FALSE] = 1, + [POP_JUMP_IF_NONE] = 1, + [POP_JUMP_IF_NOT_NONE] = 1, + [FOR_ITER] = 1, + [CALL] = 3, + [BINARY_OP] = 1, +}; +#endif + +extern const uint8_t _PyOpcode_Deopt[256]; +#ifdef NEED_OPCODE_METADATA +const uint8_t _PyOpcode_Deopt[256] = { + [BEFORE_ASYNC_WITH] = BEFORE_ASYNC_WITH, + [BEFORE_WITH] = BEFORE_WITH, + [BINARY_OP] = BINARY_OP, + [BINARY_OP_ADD_FLOAT] = BINARY_OP, + [BINARY_OP_ADD_INT] = BINARY_OP, + [BINARY_OP_ADD_UNICODE] = BINARY_OP, + [BINARY_OP_INPLACE_ADD_UNICODE] = BINARY_OP, + [BINARY_OP_MULTIPLY_FLOAT] = BINARY_OP, + [BINARY_OP_MULTIPLY_INT] = BINARY_OP, + [BINARY_OP_SUBTRACT_FLOAT] = BINARY_OP, + [BINARY_OP_SUBTRACT_INT] = BINARY_OP, + [BINARY_SLICE] = BINARY_SLICE, + [BINARY_SUBSCR] = BINARY_SUBSCR, + [BINARY_SUBSCR_DICT] = BINARY_SUBSCR, + [BINARY_SUBSCR_GETITEM] = BINARY_SUBSCR, + [BINARY_SUBSCR_LIST_INT] = BINARY_SUBSCR, + [BINARY_SUBSCR_STR_INT] = BINARY_SUBSCR, + [BINARY_SUBSCR_TUPLE_INT] = BINARY_SUBSCR, + [BUILD_CONST_KEY_MAP] = BUILD_CONST_KEY_MAP, + [BUILD_LIST] = BUILD_LIST, + [BUILD_MAP] = BUILD_MAP, + [BUILD_SET] = BUILD_SET, + [BUILD_SLICE] = BUILD_SLICE, + [BUILD_STRING] = BUILD_STRING, + [BUILD_TUPLE] = BUILD_TUPLE, + [CACHE] = CACHE, + [CALL] = CALL, + [CALL_ALLOC_AND_ENTER_INIT] = CALL, + [CALL_BOUND_METHOD_EXACT_ARGS] = CALL, + [CALL_BOUND_METHOD_GENERAL] = CALL, + [CALL_BUILTIN_CLASS] = CALL, + [CALL_BUILTIN_FAST] = CALL, + [CALL_BUILTIN_FAST_WITH_KEYWORDS] = CALL, + [CALL_BUILTIN_O] = CALL, + [CALL_FUNCTION_EX] = CALL_FUNCTION_EX, + [CALL_INTRINSIC_1] = CALL_INTRINSIC_1, + [CALL_INTRINSIC_2] = CALL_INTRINSIC_2, + [CALL_ISINSTANCE] = CALL, + [CALL_KW] = CALL_KW, + [CALL_LEN] = CALL, + [CALL_LIST_APPEND] = CALL, + [CALL_METHOD_DESCRIPTOR_FAST] = CALL, + [CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS] = CALL, + [CALL_METHOD_DESCRIPTOR_NOARGS] = CALL, + [CALL_METHOD_DESCRIPTOR_O] = CALL, + [CALL_NON_PY_GENERAL] = CALL, + [CALL_PY_EXACT_ARGS] = CALL, + [CALL_PY_GENERAL] = CALL, + [CALL_STR_1] = CALL, + [CALL_TUPLE_1] = CALL, + [CALL_TYPE_1] = CALL, + [CHECK_EG_MATCH] = CHECK_EG_MATCH, + [CHECK_EXC_MATCH] = CHECK_EXC_MATCH, + [CLEANUP_THROW] = CLEANUP_THROW, + [COMPARE_OP] = COMPARE_OP, + [COMPARE_OP_FLOAT] = COMPARE_OP, + [COMPARE_OP_INT] = COMPARE_OP, + [COMPARE_OP_STR] = COMPARE_OP, + [CONTAINS_OP] = CONTAINS_OP, + [CONTAINS_OP_DICT] = CONTAINS_OP, + [CONTAINS_OP_SET] = CONTAINS_OP, + [CONVERT_VALUE] = CONVERT_VALUE, + [COPY] = COPY, + [COPY_FREE_VARS] = COPY_FREE_VARS, + [DELETE_ATTR] = DELETE_ATTR, + [DELETE_DEREF] = DELETE_DEREF, + [DELETE_FAST] = DELETE_FAST, + [DELETE_GLOBAL] = DELETE_GLOBAL, + [DELETE_NAME] = DELETE_NAME, + [DELETE_SUBSCR] = DELETE_SUBSCR, + [DICT_MERGE] = DICT_MERGE, + [DICT_UPDATE] = DICT_UPDATE, + [END_ASYNC_FOR] = END_ASYNC_FOR, + [END_FOR] = END_FOR, + [END_SEND] = END_SEND, + [ENTER_EXECUTOR] = ENTER_EXECUTOR, + [EXIT_INIT_CHECK] = EXIT_INIT_CHECK, + [EXTENDED_ARG] = EXTENDED_ARG, + [FORMAT_SIMPLE] = FORMAT_SIMPLE, + [FORMAT_WITH_SPEC] = FORMAT_WITH_SPEC, + [FOR_ITER] = FOR_ITER, + [FOR_ITER_GEN] = FOR_ITER, + [FOR_ITER_LIST] = FOR_ITER, + [FOR_ITER_RANGE] = FOR_ITER, + 
[FOR_ITER_TUPLE] = FOR_ITER, + [GET_AITER] = GET_AITER, + [GET_ANEXT] = GET_ANEXT, + [GET_AWAITABLE] = GET_AWAITABLE, + [GET_ITER] = GET_ITER, + [GET_LEN] = GET_LEN, + [GET_YIELD_FROM_ITER] = GET_YIELD_FROM_ITER, + [IMPORT_FROM] = IMPORT_FROM, + [IMPORT_NAME] = IMPORT_NAME, + [INSTRUMENTED_CALL] = INSTRUMENTED_CALL, + [INSTRUMENTED_CALL_FUNCTION_EX] = INSTRUMENTED_CALL_FUNCTION_EX, + [INSTRUMENTED_CALL_KW] = INSTRUMENTED_CALL_KW, + [INSTRUMENTED_END_FOR] = INSTRUMENTED_END_FOR, + [INSTRUMENTED_END_SEND] = INSTRUMENTED_END_SEND, + [INSTRUMENTED_FOR_ITER] = INSTRUMENTED_FOR_ITER, + [INSTRUMENTED_INSTRUCTION] = INSTRUMENTED_INSTRUCTION, + [INSTRUMENTED_JUMP_BACKWARD] = INSTRUMENTED_JUMP_BACKWARD, + [INSTRUMENTED_JUMP_FORWARD] = INSTRUMENTED_JUMP_FORWARD, + [INSTRUMENTED_LINE] = INSTRUMENTED_LINE, + [INSTRUMENTED_LOAD_SUPER_ATTR] = INSTRUMENTED_LOAD_SUPER_ATTR, + [INSTRUMENTED_POP_JUMP_IF_FALSE] = INSTRUMENTED_POP_JUMP_IF_FALSE, + [INSTRUMENTED_POP_JUMP_IF_NONE] = INSTRUMENTED_POP_JUMP_IF_NONE, + [INSTRUMENTED_POP_JUMP_IF_NOT_NONE] = INSTRUMENTED_POP_JUMP_IF_NOT_NONE, + [INSTRUMENTED_POP_JUMP_IF_TRUE] = INSTRUMENTED_POP_JUMP_IF_TRUE, + [INSTRUMENTED_RESUME] = INSTRUMENTED_RESUME, + [INSTRUMENTED_RETURN_CONST] = INSTRUMENTED_RETURN_CONST, + [INSTRUMENTED_RETURN_VALUE] = INSTRUMENTED_RETURN_VALUE, + [INSTRUMENTED_YIELD_VALUE] = INSTRUMENTED_YIELD_VALUE, + [INTERPRETER_EXIT] = INTERPRETER_EXIT, + [IS_OP] = IS_OP, + [JUMP_BACKWARD] = JUMP_BACKWARD, + [JUMP_BACKWARD_NO_INTERRUPT] = JUMP_BACKWARD_NO_INTERRUPT, + [JUMP_FORWARD] = JUMP_FORWARD, + [LIST_APPEND] = LIST_APPEND, + [LIST_EXTEND] = LIST_EXTEND, + [LOAD_ASSERTION_ERROR] = LOAD_ASSERTION_ERROR, + [LOAD_ATTR] = LOAD_ATTR, + [LOAD_ATTR_CLASS] = LOAD_ATTR, + [LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN] = LOAD_ATTR, + [LOAD_ATTR_INSTANCE_VALUE] = LOAD_ATTR, + [LOAD_ATTR_METHOD_LAZY_DICT] = LOAD_ATTR, + [LOAD_ATTR_METHOD_NO_DICT] = LOAD_ATTR, + [LOAD_ATTR_METHOD_WITH_VALUES] = LOAD_ATTR, + [LOAD_ATTR_MODULE] = LOAD_ATTR, + [LOAD_ATTR_NONDESCRIPTOR_NO_DICT] = LOAD_ATTR, + [LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES] = LOAD_ATTR, + [LOAD_ATTR_PROPERTY] = LOAD_ATTR, + [LOAD_ATTR_SLOT] = LOAD_ATTR, + [LOAD_ATTR_WITH_HINT] = LOAD_ATTR, + [LOAD_BUILD_CLASS] = LOAD_BUILD_CLASS, + [LOAD_CONST] = LOAD_CONST, + [LOAD_DEREF] = LOAD_DEREF, + [LOAD_FAST] = LOAD_FAST, + [LOAD_FAST_AND_CLEAR] = LOAD_FAST_AND_CLEAR, + [LOAD_FAST_CHECK] = LOAD_FAST_CHECK, + [LOAD_FAST_LOAD_FAST] = LOAD_FAST_LOAD_FAST, + [LOAD_FROM_DICT_OR_DEREF] = LOAD_FROM_DICT_OR_DEREF, + [LOAD_FROM_DICT_OR_GLOBALS] = LOAD_FROM_DICT_OR_GLOBALS, + [LOAD_GLOBAL] = LOAD_GLOBAL, + [LOAD_GLOBAL_BUILTIN] = LOAD_GLOBAL, + [LOAD_GLOBAL_MODULE] = LOAD_GLOBAL, + [LOAD_LOCALS] = LOAD_LOCALS, + [LOAD_NAME] = LOAD_NAME, + [LOAD_SUPER_ATTR] = LOAD_SUPER_ATTR, + [LOAD_SUPER_ATTR_ATTR] = LOAD_SUPER_ATTR, + [LOAD_SUPER_ATTR_METHOD] = LOAD_SUPER_ATTR, + [MAKE_CELL] = MAKE_CELL, + [MAKE_FUNCTION] = MAKE_FUNCTION, + [MAP_ADD] = MAP_ADD, + [MATCH_CLASS] = MATCH_CLASS, + [MATCH_KEYS] = MATCH_KEYS, + [MATCH_MAPPING] = MATCH_MAPPING, + [MATCH_SEQUENCE] = MATCH_SEQUENCE, + [NOP] = NOP, + [POP_EXCEPT] = POP_EXCEPT, + [POP_JUMP_IF_FALSE] = POP_JUMP_IF_FALSE, + [POP_JUMP_IF_NONE] = POP_JUMP_IF_NONE, + [POP_JUMP_IF_NOT_NONE] = POP_JUMP_IF_NOT_NONE, + [POP_JUMP_IF_TRUE] = POP_JUMP_IF_TRUE, + [POP_TOP] = POP_TOP, + [PUSH_EXC_INFO] = PUSH_EXC_INFO, + [PUSH_NULL] = PUSH_NULL, + [RAISE_VARARGS] = RAISE_VARARGS, + [RERAISE] = RERAISE, + [RESERVED] = RESERVED, + [RESUME] = RESUME, + [RESUME_CHECK] = RESUME, + [RETURN_CONST] = RETURN_CONST, + 
[RETURN_GENERATOR] = RETURN_GENERATOR, + [RETURN_VALUE] = RETURN_VALUE, + [SEND] = SEND, + [SEND_GEN] = SEND, + [SETUP_ANNOTATIONS] = SETUP_ANNOTATIONS, + [SET_ADD] = SET_ADD, + [SET_FUNCTION_ATTRIBUTE] = SET_FUNCTION_ATTRIBUTE, + [SET_UPDATE] = SET_UPDATE, + [STORE_ATTR] = STORE_ATTR, + [STORE_ATTR_INSTANCE_VALUE] = STORE_ATTR, + [STORE_ATTR_SLOT] = STORE_ATTR, + [STORE_ATTR_WITH_HINT] = STORE_ATTR, + [STORE_DEREF] = STORE_DEREF, + [STORE_FAST] = STORE_FAST, + [STORE_FAST_LOAD_FAST] = STORE_FAST_LOAD_FAST, + [STORE_FAST_STORE_FAST] = STORE_FAST_STORE_FAST, + [STORE_GLOBAL] = STORE_GLOBAL, + [STORE_NAME] = STORE_NAME, + [STORE_SLICE] = STORE_SLICE, + [STORE_SUBSCR] = STORE_SUBSCR, + [STORE_SUBSCR_DICT] = STORE_SUBSCR, + [STORE_SUBSCR_LIST_INT] = STORE_SUBSCR, + [SWAP] = SWAP, + [TO_BOOL] = TO_BOOL, + [TO_BOOL_ALWAYS_TRUE] = TO_BOOL, + [TO_BOOL_BOOL] = TO_BOOL, + [TO_BOOL_INT] = TO_BOOL, + [TO_BOOL_LIST] = TO_BOOL, + [TO_BOOL_NONE] = TO_BOOL, + [TO_BOOL_STR] = TO_BOOL, + [UNARY_INVERT] = UNARY_INVERT, + [UNARY_NEGATIVE] = UNARY_NEGATIVE, + [UNARY_NOT] = UNARY_NOT, + [UNPACK_EX] = UNPACK_EX, + [UNPACK_SEQUENCE] = UNPACK_SEQUENCE, + [UNPACK_SEQUENCE_LIST] = UNPACK_SEQUENCE, + [UNPACK_SEQUENCE_TUPLE] = UNPACK_SEQUENCE, + [UNPACK_SEQUENCE_TWO_TUPLE] = UNPACK_SEQUENCE, + [WITH_EXCEPT_START] = WITH_EXCEPT_START, + [YIELD_VALUE] = YIELD_VALUE, +}; + +#endif // NEED_OPCODE_METADATA + +#define EXTRA_CASES \ + case 119: \ + case 120: \ + case 121: \ + case 122: \ + case 123: \ + case 124: \ + case 125: \ + case 126: \ + case 127: \ + case 128: \ + case 129: \ + case 130: \ + case 131: \ + case 132: \ + case 133: \ + case 134: \ + case 135: \ + case 136: \ + case 137: \ + case 138: \ + case 139: \ + case 140: \ + case 141: \ + case 142: \ + case 143: \ + case 144: \ + case 145: \ + case 146: \ + case 147: \ + case 148: \ + case 223: \ + case 224: \ + case 225: \ + case 226: \ + case 227: \ + case 228: \ + case 229: \ + case 230: \ + case 231: \ + case 232: \ + case 233: \ + case 234: \ + case 235: \ + case 255: \ + ; +struct pseudo_targets { + uint8_t targets[3]; +}; +extern const struct pseudo_targets _PyOpcode_PseudoTargets[12]; +#ifdef NEED_OPCODE_METADATA +const struct pseudo_targets _PyOpcode_PseudoTargets[12] = { + [LOAD_CLOSURE-256] = { { LOAD_FAST, 0, 0 } }, + [STORE_FAST_MAYBE_NULL-256] = { { STORE_FAST, 0, 0 } }, + [LOAD_SUPER_METHOD-256] = { { LOAD_SUPER_ATTR, 0, 0 } }, + [LOAD_ZERO_SUPER_METHOD-256] = { { LOAD_SUPER_ATTR, 0, 0 } }, + [LOAD_ZERO_SUPER_ATTR-256] = { { LOAD_SUPER_ATTR, 0, 0 } }, + [LOAD_METHOD-256] = { { LOAD_ATTR, 0, 0 } }, + [JUMP-256] = { { JUMP_FORWARD, JUMP_BACKWARD, 0 } }, + [JUMP_NO_INTERRUPT-256] = { { JUMP_FORWARD, JUMP_BACKWARD_NO_INTERRUPT, 0 } }, + [SETUP_FINALLY-256] = { { NOP, 0, 0 } }, + [SETUP_CLEANUP-256] = { { NOP, 0, 0 } }, + [SETUP_WITH-256] = { { NOP, 0, 0 } }, + [POP_BLOCK-256] = { { NOP, 0, 0 } }, +}; + +#endif // NEED_OPCODE_METADATA +static inline bool +is_pseudo_target(int pseudo, int target) { + if (pseudo < 256 || pseudo >= 268) { + return false; + } + for (int i = 0; _PyOpcode_PseudoTargets[pseudo-256].targets[i]; i++) { + if (_PyOpcode_PseudoTargets[pseudo-256].targets[i] == target) return true; + } + return false; +} + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_CORE_OPCODE_METADATA_H */ diff --git a/Include/internal/pycore_opcode_utils.h b/Include/internal/pycore_opcode_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..208bfb2f75308bc80e39d07e783026cf226789ac --- /dev/null +++ 
b/Include/internal/pycore_opcode_utils.h @@ -0,0 +1,73 @@
+#ifndef Py_INTERNAL_OPCODE_UTILS_H
+#define Py_INTERNAL_OPCODE_UTILS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "opcode_ids.h"
+
+#define MAX_REAL_OPCODE 254
+
+#define IS_WITHIN_OPCODE_RANGE(opcode) \
+        (((opcode) >= 0 && (opcode) <= MAX_REAL_OPCODE) || \
+         IS_PSEUDO_INSTR(opcode))
+
+#define IS_BLOCK_PUSH_OPCODE(opcode) \
+        ((opcode) == SETUP_FINALLY || \
+         (opcode) == SETUP_WITH || \
+         (opcode) == SETUP_CLEANUP)
+
+#define HAS_TARGET(opcode) \
+        (OPCODE_HAS_JUMP(opcode) || IS_BLOCK_PUSH_OPCODE(opcode))
+
+/* opcodes that must be last in the basicblock */
+#define IS_TERMINATOR_OPCODE(opcode) \
+        (OPCODE_HAS_JUMP(opcode) || IS_SCOPE_EXIT_OPCODE(opcode))
+
+/* opcodes which are not emitted in codegen stage, only by the assembler */
+#define IS_ASSEMBLER_OPCODE(opcode) \
+        ((opcode) == JUMP_FORWARD || \
+         (opcode) == JUMP_BACKWARD || \
+         (opcode) == JUMP_BACKWARD_NO_INTERRUPT)
+
+#define IS_BACKWARDS_JUMP_OPCODE(opcode) \
+        ((opcode) == JUMP_BACKWARD || \
+         (opcode) == JUMP_BACKWARD_NO_INTERRUPT)
+
+#define IS_UNCONDITIONAL_JUMP_OPCODE(opcode) \
+        ((opcode) == JUMP || \
+         (opcode) == JUMP_NO_INTERRUPT || \
+         (opcode) == JUMP_FORWARD || \
+         (opcode) == JUMP_BACKWARD || \
+         (opcode) == JUMP_BACKWARD_NO_INTERRUPT)
+
+#define IS_SCOPE_EXIT_OPCODE(opcode) \
+        ((opcode) == RETURN_VALUE || \
+         (opcode) == RETURN_CONST || \
+         (opcode) == RAISE_VARARGS || \
+         (opcode) == RERAISE)
+
+
+/* Flags used in the oparg for MAKE_FUNCTION */
+#define MAKE_FUNCTION_DEFAULTS 0x01
+#define MAKE_FUNCTION_KWDEFAULTS 0x02
+#define MAKE_FUNCTION_ANNOTATIONS 0x04
+#define MAKE_FUNCTION_CLOSURE 0x08
+
+/* Values used in the oparg for RESUME */
+#define RESUME_AT_FUNC_START 0
+#define RESUME_AFTER_YIELD 1
+#define RESUME_AFTER_YIELD_FROM 2
+#define RESUME_AFTER_AWAIT 3
+
+#define RESUME_OPARG_LOCATION_MASK 0x3
+#define RESUME_OPARG_DEPTH1_MASK 0x4
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_OPCODE_UTILS_H */
diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..49aa67c6f3ccc0d111f721b575268228a930a35d
--- /dev/null
+++ b/Include/internal/pycore_optimizer.h
@@ -0,0 +1,272 @@
+#ifndef Py_INTERNAL_OPTIMIZER_H
+#define Py_INTERNAL_OPTIMIZER_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_uop_ids.h"
+#include <stdbool.h>
+
+
+typedef struct _PyExecutorLinkListNode {
+    struct _PyExecutorObject *next;
+    struct _PyExecutorObject *previous;
+} _PyExecutorLinkListNode;
+
+
+/* Bloom filter with m = 256
+ * https://en.wikipedia.org/wiki/Bloom_filter */
+#define BLOOM_FILTER_WORDS 8
+
+typedef struct _bloom_filter {
+    uint32_t bits[BLOOM_FILTER_WORDS];
+} _PyBloomFilter;
+
+typedef struct {
+    uint8_t opcode;
+    uint8_t oparg;
+    uint8_t valid;
+    uint8_t linked;
+    int index;  // Index of ENTER_EXECUTOR (if code isn't NULL, below).
+    _PyBloomFilter bloom;
+    _PyExecutorLinkListNode links;
+    PyCodeObject *code;  // Weak (NULL if no corresponding ENTER_EXECUTOR).
+} _PyVMData; + +#define UOP_FORMAT_TARGET 0 +#define UOP_FORMAT_EXIT 1 +#define UOP_FORMAT_JUMP 2 +#define UOP_FORMAT_UNUSED 3 + +/* Depending on the format, + * the 32 bits between the oparg and operand are: + * UOP_FORMAT_TARGET: + * uint32_t target; + * UOP_FORMAT_EXIT + * uint16_t exit_index; + * uint16_t error_target; + * UOP_FORMAT_JUMP + * uint16_t jump_target; + * uint16_t error_target; + */ +typedef struct { + uint16_t opcode:14; + uint16_t format:2; + uint16_t oparg; + union { + uint32_t target; + struct { + union { + uint16_t exit_index; + uint16_t jump_target; + }; + uint16_t error_target; + }; + }; + uint64_t operand; // A cache entry +} _PyUOpInstruction; + +static inline uint32_t uop_get_target(const _PyUOpInstruction *inst) +{ + assert(inst->format == UOP_FORMAT_TARGET); + return inst->target; +} + +static inline uint16_t uop_get_exit_index(const _PyUOpInstruction *inst) +{ + assert(inst->format == UOP_FORMAT_EXIT); + return inst->exit_index; +} + +static inline uint16_t uop_get_jump_target(const _PyUOpInstruction *inst) +{ + assert(inst->format == UOP_FORMAT_JUMP); + return inst->jump_target; +} + +static inline uint16_t uop_get_error_target(const _PyUOpInstruction *inst) +{ + assert(inst->format != UOP_FORMAT_TARGET); + return inst->error_target; +} + +typedef struct _exit_data { + uint32_t target; + _Py_BackoffCounter temperature; + const struct _PyExecutorObject *executor; +} _PyExitData; + +typedef struct _PyExecutorObject { + PyObject_VAR_HEAD + const _PyUOpInstruction *trace; + _PyVMData vm_data; /* Used by the VM, but opaque to the optimizer */ + uint32_t exit_count; + uint32_t code_size; + size_t jit_size; + void *jit_code; + void *jit_side_entry; + _PyExitData exits[1]; +} _PyExecutorObject; + +typedef struct _PyOptimizerObject _PyOptimizerObject; + +/* Should return > 0 if a new executor is created. O if no executor is produced and < 0 if an error occurred. 
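+ * In other words: on a positive return, a new executor has been stored in
+ * *exec_ptr; on a zero return, no executor was produced; on a negative
+ * return, an error occurred and an exception should be set.
+ *
+ * A minimal "always decline" slot would look like this (illustrative
+ * sketch only; the name is invented):
+ *
+ *   static int
+ *   never_optimize(_PyOptimizerObject *self, struct _PyInterpreterFrame *frame,
+ *                  _Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr,
+ *                  int curr_stackentries)
+ *   {
+ *       return 0;   // no executor produced, not an error
+ *   }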
*/ +typedef int (*optimize_func)( + _PyOptimizerObject* self, struct _PyInterpreterFrame *frame, + _Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr, + int curr_stackentries); + +struct _PyOptimizerObject { + PyObject_HEAD + optimize_func optimize; + /* Data needed by the optimizer goes here, but is opaque to the VM */ +}; + +/** Test support **/ +typedef struct { + _PyOptimizerObject base; + int64_t count; +} _PyCounterOptimizerObject; + +_PyOptimizerObject *_Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject* optimizer); + +PyAPI_FUNC(int) _Py_SetTier2Optimizer(_PyOptimizerObject* optimizer); + +PyAPI_FUNC(_PyOptimizerObject *) _Py_GetOptimizer(void); + +PyAPI_FUNC(_PyExecutorObject *) _Py_GetExecutor(PyCodeObject *code, int offset); + +void _Py_ExecutorInit(_PyExecutorObject *, const _PyBloomFilter *); +void _Py_ExecutorDetach(_PyExecutorObject *); +void _Py_BloomFilter_Init(_PyBloomFilter *); +void _Py_BloomFilter_Add(_PyBloomFilter *bloom, void *obj); +PyAPI_FUNC(void) _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj); +/* For testing */ +PyAPI_FUNC(PyObject *) _PyOptimizer_NewCounter(void); +PyAPI_FUNC(PyObject *) _PyOptimizer_NewUOpOptimizer(void); + +#define _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS 3 +#define _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS 6 + +#ifdef _Py_TIER2 +PyAPI_FUNC(void) _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation); +PyAPI_FUNC(void) _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation); +#else +# define _Py_Executors_InvalidateDependency(A, B, C) ((void)0) +# define _Py_Executors_InvalidateAll(A, B) ((void)0) +#endif + + +// This is the length of the trace we project initially. +#define UOP_MAX_TRACE_LENGTH 800 + +#define TRACE_STACK_SIZE 5 + +int _Py_uop_analyze_and_optimize(struct _PyInterpreterFrame *frame, + _PyUOpInstruction *trace, int trace_len, int curr_stackentries, + _PyBloomFilter *dependencies); + +extern PyTypeObject _PyCounterExecutor_Type; +extern PyTypeObject _PyCounterOptimizer_Type; +extern PyTypeObject _PyDefaultOptimizer_Type; +extern PyTypeObject _PyUOpExecutor_Type; +extern PyTypeObject _PyUOpOptimizer_Type; + +/* Symbols */ +/* See explanation in optimizer_symbols.c */ + +struct _Py_UopsSymbol { + int flags; // 0 bits: Top; 2 or more bits: Bottom + PyTypeObject *typ; // Borrowed reference + PyObject *const_val; // Owned reference (!) +}; + +// Holds locals, stack, locals, stack ... co_consts (in that order) +#define MAX_ABSTRACT_INTERP_SIZE 4096 + +#define TY_ARENA_SIZE (UOP_MAX_TRACE_LENGTH * 5) + +// Need extras for root frame and for overflow frame (see TRACE_STACK_PUSH()) +#define MAX_ABSTRACT_FRAME_DEPTH (TRACE_STACK_SIZE + 2) + +typedef struct _Py_UopsSymbol _Py_UopsSymbol; + +struct _Py_UOpsAbstractFrame { + // Max stacklen + int stack_len; + int locals_len; + + _Py_UopsSymbol **stack_pointer; + _Py_UopsSymbol **stack; + _Py_UopsSymbol **locals; +}; + +typedef struct _Py_UOpsAbstractFrame _Py_UOpsAbstractFrame; + +typedef struct ty_arena { + int ty_curr_number; + int ty_max_number; + _Py_UopsSymbol arena[TY_ARENA_SIZE]; +} ty_arena; + +struct _Py_UOpsContext { + PyObject_HEAD + // The current "executing" frame. + _Py_UOpsAbstractFrame *frame; + _Py_UOpsAbstractFrame frames[MAX_ABSTRACT_FRAME_DEPTH]; + int curr_frame_depth; + + // Arena for the symbolic types. 
+ ty_arena t_arena; + + _Py_UopsSymbol **n_consumed; + _Py_UopsSymbol **limit; + _Py_UopsSymbol *locals_and_stack[MAX_ABSTRACT_INTERP_SIZE]; +}; + +typedef struct _Py_UOpsContext _Py_UOpsContext; + +extern bool _Py_uop_sym_is_null(_Py_UopsSymbol *sym); +extern bool _Py_uop_sym_is_not_null(_Py_UopsSymbol *sym); +extern bool _Py_uop_sym_is_const(_Py_UopsSymbol *sym); +extern PyObject *_Py_uop_sym_get_const(_Py_UopsSymbol *sym); +extern _Py_UopsSymbol *_Py_uop_sym_new_unknown(_Py_UOpsContext *ctx); +extern _Py_UopsSymbol *_Py_uop_sym_new_not_null(_Py_UOpsContext *ctx); +extern _Py_UopsSymbol *_Py_uop_sym_new_type( + _Py_UOpsContext *ctx, PyTypeObject *typ); +extern _Py_UopsSymbol *_Py_uop_sym_new_const(_Py_UOpsContext *ctx, PyObject *const_val); +extern _Py_UopsSymbol *_Py_uop_sym_new_null(_Py_UOpsContext *ctx); +extern bool _Py_uop_sym_has_type(_Py_UopsSymbol *sym); +extern bool _Py_uop_sym_matches_type(_Py_UopsSymbol *sym, PyTypeObject *typ); +extern bool _Py_uop_sym_set_null(_Py_UopsSymbol *sym); +extern bool _Py_uop_sym_set_non_null(_Py_UopsSymbol *sym); +extern bool _Py_uop_sym_set_type(_Py_UopsSymbol *sym, PyTypeObject *typ); +extern bool _Py_uop_sym_set_const(_Py_UopsSymbol *sym, PyObject *const_val); +extern bool _Py_uop_sym_is_bottom(_Py_UopsSymbol *sym); +extern int _Py_uop_sym_truthiness(_Py_UopsSymbol *sym); +extern PyTypeObject *_Py_uop_sym_get_type(_Py_UopsSymbol *sym); + + +extern int _Py_uop_abstractcontext_init(_Py_UOpsContext *ctx); +extern void _Py_uop_abstractcontext_fini(_Py_UOpsContext *ctx); + +extern _Py_UOpsAbstractFrame *_Py_uop_frame_new( + _Py_UOpsContext *ctx, + PyCodeObject *co, + int curr_stackentries, + _Py_UopsSymbol **args, + int arg_len); +extern int _Py_uop_frame_pop(_Py_UOpsContext *ctx); + +PyAPI_FUNC(PyObject *) _Py_uop_symbols_test(PyObject *self, PyObject *ignored); + +PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer, _PyExecutorObject **exec_ptr); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_OPTIMIZER_H */ diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h new file mode 100644 index 0000000000000000000000000000000000000000..8c9260e2636fbc7ec0396ac443bdd6fdbafb67df --- /dev/null +++ b/Include/internal/pycore_parking_lot.h @@ -0,0 +1,97 @@ +// ParkingLot is an internal API for building efficient synchronization +// primitives like mutexes and events. +// +// The API and name is inspired by WebKit's WTF::ParkingLot, which in turn +// is inspired Linux's futex API. +// See https://webkit.org/blog/6161/locking-in-webkit/. +// +// The core functionality is an atomic "compare-and-sleep" operation along with +// an atomic "wake-up" operation. + +#ifndef Py_INTERNAL_PARKING_LOT_H +#define Py_INTERNAL_PARKING_LOT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + + +enum { + // The thread was unparked by another thread. + Py_PARK_OK = 0, + + // The value of `address` did not match `expected`. + Py_PARK_AGAIN = -1, + + // The thread was unparked due to a timeout. + Py_PARK_TIMEOUT = -2, + + // The thread was interrupted by a signal. + Py_PARK_INTR = -3, +}; + +// Checks that `*address == *expected` and puts the thread to sleep until an +// unpark operation is called on the same `address`. Otherwise, the function +// returns `Py_PARK_AGAIN`. The comparison behaves like memcmp, but is +// performed atomically with respect to unpark operations. 
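+// (That atomicity is what rules out lost wake-ups: a waiter cannot observe
+// the expected value and then fall asleep after another thread has already
+// issued the corresponding unpark.)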
+//
+// The `address_size` argument is the size of the data pointed to by the
+// `address` and `expected` pointers (i.e., sizeof(*address)). It must be
+// 1, 2, 4, or 8.
+//
+// The `timeout_ns` argument specifies the maximum amount of time to wait, with
+// -1 indicating an infinite wait.
+//
+// `park_arg`, which can be NULL, is passed to the unpark operation.
+//
+// If `detach` is true, then the thread will detach/release the GIL while
+// waiting.
+//
+// Example usage:
+//
+//   if (_Py_atomic_compare_exchange_uint8(address, &expected, new_value)) {
+//       int res = _PyParkingLot_Park(address, &new_value, sizeof(*address),
+//                                    timeout_ns, NULL, 1);
+//       ...
+//   }
+PyAPI_FUNC(int)
+_PyParkingLot_Park(const void *address, const void *expected,
+                   size_t address_size, PyTime_t timeout_ns,
+                   void *park_arg, int detach);
+
+// Callback for _PyParkingLot_Unpark:
+//
+// `arg` is the data of the same name provided to the _PyParkingLot_Unpark()
+// call.
+// `park_arg` is the data provided to the _PyParkingLot_Park() call, or NULL
+// if no waiting thread was found.
+// `has_more_waiters` is true if there are more threads waiting on the same
+// address. May be true in cases where threads are waiting on a different
+// address that maps to the same internal bucket.
+typedef void _Py_unpark_fn_t(void *arg, void *park_arg, int has_more_waiters);
+
+// Unparks a single thread waiting on `address`.
+//
+// Note that fn() is called regardless of whether a thread was unparked. If
+// no threads are waiting on `address` then the `park_arg` argument to fn()
+// will be NULL.
+//
+// Example usage:
+//   void callback(void *arg, void *park_arg, int has_more_waiters);
+//   _PyParkingLot_Unpark(address, &callback, arg);
+PyAPI_FUNC(void)
+_PyParkingLot_Unpark(const void *address, _Py_unpark_fn_t *fn, void *arg);
+
+// Unparks all threads waiting on `address`.
+PyAPI_FUNC(void) _PyParkingLot_UnparkAll(const void *address);
+
+// Resets the parking lot state after a fork. Forgets all parked threads.
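+// (After fork() only the calling thread exists in the child, so any recorded
+// waiters would be stale.)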
+PyAPI_FUNC(void) _PyParkingLot_AfterFork(void); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_PARKING_LOT_H */ diff --git a/Include/internal/pycore_parser.h b/Include/internal/pycore_parser.h new file mode 100644 index 0000000000000000000000000000000000000000..b16084aaa155155d2dc6cd7be581c6163b53b1ad --- /dev/null +++ b/Include/internal/pycore_parser.h @@ -0,0 +1,95 @@ +#ifndef Py_INTERNAL_PARSER_H +#define Py_INTERNAL_PARSER_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + + +#include "pycore_ast.h" // struct _expr +#include "pycore_global_strings.h" // _Py_DECLARE_STR() +#include "pycore_pyarena.h" // PyArena + + +#ifdef Py_DEBUG +#define _PYPEGEN_NSTATISTICS 2000 +#endif + +struct _parser_runtime_state { +#ifdef Py_DEBUG + long memo_statistics[_PYPEGEN_NSTATISTICS]; +#ifdef Py_GIL_DISABLED + PyMutex mutex; +#endif +#else + int _not_used; +#endif + struct _expr dummy_name; +}; + +_Py_DECLARE_STR(empty, "") +#if defined(Py_DEBUG) && defined(Py_GIL_DISABLED) +#define _parser_runtime_state_INIT \ + { \ + .mutex = {0}, \ + .dummy_name = { \ + .kind = Name_kind, \ + .v.Name.id = &_Py_STR(empty), \ + .v.Name.ctx = Load, \ + .lineno = 1, \ + .col_offset = 0, \ + .end_lineno = 1, \ + .end_col_offset = 0, \ + }, \ + } +#else +#define _parser_runtime_state_INIT \ + { \ + .dummy_name = { \ + .kind = Name_kind, \ + .v.Name.id = &_Py_STR(empty), \ + .v.Name.ctx = Load, \ + .lineno = 1, \ + .col_offset = 0, \ + .end_lineno = 1, \ + .end_col_offset = 0, \ + }, \ + } +#endif + +extern struct _mod* _PyParser_ASTFromString( + const char *str, + PyObject* filename, + int mode, + PyCompilerFlags *flags, + PyArena *arena); + +extern struct _mod* _PyParser_ASTFromFile( + FILE *fp, + PyObject *filename_ob, + const char *enc, + int mode, + const char *ps1, + const char *ps2, + PyCompilerFlags *flags, + int *errcode, + PyArena *arena); +extern struct _mod* _PyParser_InteractiveASTFromFile( + FILE *fp, + PyObject *filename_ob, + const char *enc, + int mode, + const char *ps1, + const char *ps2, + PyCompilerFlags *flags, + int *errcode, + PyObject **interactive_src, + PyArena *arena); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_PARSER_H */ diff --git a/Include/internal/pycore_pathconfig.h b/Include/internal/pycore_pathconfig.h new file mode 100644 index 0000000000000000000000000000000000000000..a1ce1b19a00283292f09cdd59cb12ad2b6edaac9 --- /dev/null +++ b/Include/internal/pycore_pathconfig.h @@ -0,0 +1,26 @@ +#ifndef Py_INTERNAL_PATHCONFIG_H +#define Py_INTERNAL_PATHCONFIG_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(void) _PyPathConfig_ClearGlobal(void); + +extern PyStatus _PyPathConfig_ReadGlobal(PyConfig *config); +extern PyStatus _PyPathConfig_UpdateGlobal(const PyConfig *config); +extern const wchar_t * _PyPathConfig_GetGlobalModuleSearchPath(void); + +extern int _PyPathConfig_ComputeSysPath0( + const PyWideStringList *argv, + PyObject **path0); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_PATHCONFIG_H */ diff --git a/Include/internal/pycore_pyarena.h b/Include/internal/pycore_pyarena.h new file mode 100644 index 0000000000000000000000000000000000000000..1f07479fb2ca27ff2adcf2c6e1f9fa3a75096a61 --- /dev/null +++ b/Include/internal/pycore_pyarena.h @@ -0,0 +1,68 @@ +// An arena-like memory interface for the compiler. 
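+//
+// A minimal lifetime sketch (illustrative only; `node` could be any
+// AST-node type, error handling elided):
+//
+//   PyArena *arena = _PyArena_New();
+//   if (arena == NULL) {
+//       return NULL;
+//   }
+//   stmt_ty node = _PyArena_Malloc(arena, sizeof(*node));
+//   ...
+//   _PyArena_Free(arena);   // releases node and every other allocation at once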
+ +#ifndef Py_INTERNAL_PYARENA_H +#define Py_INTERNAL_PYARENA_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +typedef struct _arena PyArena; + +// _PyArena_New() and _PyArena_Free() create a new arena and free it, +// respectively. Once an arena has been created, it can be used +// to allocate memory via _PyArena_Malloc(). Pointers to PyObject can +// also be registered with the arena via _PyArena_AddPyObject(), and the +// arena will ensure that the PyObjects stay alive at least until +// _PyArena_Free() is called. When an arena is freed, all the memory it +// allocated is freed, the arena releases internal references to registered +// PyObject*, and none of its pointers are valid. +// XXX (tim) What does "none of its pointers are valid" mean? Does it +// XXX mean that pointers previously obtained via _PyArena_Malloc() are +// XXX no longer valid? (That's clearly true, but not sure that's what +// XXX the text is trying to say.) +// +// _PyArena_New() returns an arena pointer. On error, it +// returns a negative number and sets an exception. +// XXX (tim): Not true. On error, _PyArena_New() actually returns NULL, +// XXX and looks like it may or may not set an exception (e.g., if the +// XXX internal PyList_New(0) returns NULL, _PyArena_New() passes that on +// XXX and an exception is set; OTOH, if the internal +// XXX block_new(DEFAULT_BLOCK_SIZE) returns NULL, that's passed on but +// XXX an exception is not set in that case). +// +// Export for test_peg_generator +PyAPI_FUNC(PyArena*) _PyArena_New(void); + +// Export for test_peg_generator +PyAPI_FUNC(void) _PyArena_Free(PyArena *); + +// Mostly like malloc(), return the address of a block of memory spanning +// `size` bytes, or return NULL (without setting an exception) if enough +// new memory can't be obtained. Unlike malloc(0), _PyArena_Malloc() with +// size=0 does not guarantee to return a unique pointer (the pointer +// returned may equal one or more other pointers obtained from +// _PyArena_Malloc()). +// Note that pointers obtained via _PyArena_Malloc() must never be passed to +// the system free() or realloc(), or to any of Python's similar memory- +// management functions. _PyArena_Malloc()-obtained pointers remain valid +// until _PyArena_Free(ar) is called, at which point all pointers obtained +// from the arena `ar` become invalid simultaneously. +// +// Export for test_peg_generator +PyAPI_FUNC(void*) _PyArena_Malloc(PyArena *, size_t size); + +// This routine isn't a proper arena allocation routine. It takes +// a PyObject* and records it so that it can be DECREFed when the +// arena is freed. +// +// Export for test_peg_generator +PyAPI_FUNC(int) _PyArena_AddPyObject(PyArena *, PyObject *); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_PYARENA_H */ diff --git a/Include/internal/pycore_pyatomic_ft_wrappers.h b/Include/internal/pycore_pyatomic_ft_wrappers.h new file mode 100644 index 0000000000000000000000000000000000000000..d755d03a5fa190d3afc83ae6f4964b174a710d4d --- /dev/null +++ b/Include/internal/pycore_pyatomic_ft_wrappers.h @@ -0,0 +1,165 @@ +// This header file provides wrappers around the atomic operations found in +// `pyatomic.h` that are only atomic in free-threaded builds. 
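+// (Concretely: each FT_ATOMIC_* macro below expands to a real atomic
+// operation in Py_GIL_DISABLED builds, and to a plain load or store in the
+// default build.)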
+// +// These are intended to be used in places where atomics are required in +// free-threaded builds, but not in the default build, and we don't want to +// introduce the potential performance overhead of an atomic operation in the +// default build. +// +// All usages of these macros should be replaced with unconditionally atomic or +// non-atomic versions, and this file should be removed, once the dust settles +// on free threading. +#ifndef Py_ATOMIC_FT_WRAPPERS_H +#define Py_ATOMIC_FT_WRAPPERS_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +#error "this header requires Py_BUILD_CORE define" +#endif + +#ifdef Py_GIL_DISABLED +#define FT_ATOMIC_LOAD_PTR(value) _Py_atomic_load_ptr(&value) +#define FT_ATOMIC_STORE_PTR(value, new_value) _Py_atomic_store_ptr(&value, new_value) +#define FT_ATOMIC_LOAD_SSIZE(value) _Py_atomic_load_ssize(&value) +#define FT_ATOMIC_LOAD_SSIZE_ACQUIRE(value) \ + _Py_atomic_load_ssize_acquire(&value) +#define FT_ATOMIC_LOAD_SSIZE_RELAXED(value) \ + _Py_atomic_load_ssize_relaxed(&value) +#define FT_ATOMIC_STORE_PTR(value, new_value) \ + _Py_atomic_store_ptr(&value, new_value) +#define FT_ATOMIC_LOAD_PTR_ACQUIRE(value) \ + _Py_atomic_load_ptr_acquire(&value) +#define FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(value) \ + _Py_atomic_load_uintptr_acquire(&value) +#define FT_ATOMIC_LOAD_PTR_RELAXED(value) \ + _Py_atomic_load_ptr_relaxed(&value) +#define FT_ATOMIC_LOAD_UINT8(value) \ + _Py_atomic_load_uint8(&value) +#define FT_ATOMIC_STORE_UINT8(value, new_value) \ + _Py_atomic_store_uint8(&value, new_value) +#define FT_ATOMIC_LOAD_UINT8_RELAXED(value) \ + _Py_atomic_load_uint8_relaxed(&value) +#define FT_ATOMIC_LOAD_UINT16_RELAXED(value) \ + _Py_atomic_load_uint16_relaxed(&value) +#define FT_ATOMIC_LOAD_UINT32_RELAXED(value) \ + _Py_atomic_load_uint32_relaxed(&value) +#define FT_ATOMIC_LOAD_ULONG_RELAXED(value) \ + _Py_atomic_load_ulong_relaxed(&value) +#define FT_ATOMIC_STORE_PTR_RELAXED(value, new_value) \ + _Py_atomic_store_ptr_relaxed(&value, new_value) +#define FT_ATOMIC_STORE_PTR_RELEASE(value, new_value) \ + _Py_atomic_store_ptr_release(&value, new_value) +#define FT_ATOMIC_STORE_UINTPTR_RELEASE(value, new_value) \ + _Py_atomic_store_uintptr_release(&value, new_value) +#define FT_ATOMIC_STORE_SSIZE_RELAXED(value, new_value) \ + _Py_atomic_store_ssize_relaxed(&value, new_value) +#define FT_ATOMIC_STORE_UINT8_RELAXED(value, new_value) \ + _Py_atomic_store_uint8_relaxed(&value, new_value) +#define FT_ATOMIC_STORE_UINT16_RELAXED(value, new_value) \ + _Py_atomic_store_uint16_relaxed(&value, new_value) +#define FT_ATOMIC_STORE_UINT32_RELAXED(value, new_value) \ + _Py_atomic_store_uint32_relaxed(&value, new_value) +#define FT_ATOMIC_STORE_CHAR_RELAXED(value, new_value) \ + _Py_atomic_store_char_relaxed(&value, new_value) +#define FT_ATOMIC_LOAD_CHAR_RELAXED(value) \ + _Py_atomic_load_char_relaxed(&value) +#define FT_ATOMIC_STORE_UCHAR_RELAXED(value, new_value) \ + _Py_atomic_store_uchar_relaxed(&value, new_value) +#define FT_ATOMIC_LOAD_UCHAR_RELAXED(value) \ + _Py_atomic_load_uchar_relaxed(&value) +#define FT_ATOMIC_STORE_SHORT_RELAXED(value, new_value) \ + _Py_atomic_store_short_relaxed(&value, new_value) +#define FT_ATOMIC_LOAD_SHORT_RELAXED(value) \ + _Py_atomic_load_short_relaxed(&value) +#define FT_ATOMIC_STORE_USHORT_RELAXED(value, new_value) \ + _Py_atomic_store_ushort_relaxed(&value, new_value) +#define FT_ATOMIC_LOAD_USHORT_RELAXED(value) \ + _Py_atomic_load_ushort_relaxed(&value) +#define FT_ATOMIC_STORE_INT_RELAXED(value, new_value) \ + 
_Py_atomic_store_int_relaxed(&value, new_value) +#define FT_ATOMIC_LOAD_INT_RELAXED(value) \ + _Py_atomic_load_int_relaxed(&value) +#define FT_ATOMIC_STORE_UINT_RELAXED(value, new_value) \ + _Py_atomic_store_uint_relaxed(&value, new_value) +#define FT_ATOMIC_LOAD_UINT_RELAXED(value) \ + _Py_atomic_load_uint_relaxed(&value) +#define FT_ATOMIC_STORE_LONG_RELAXED(value, new_value) \ + _Py_atomic_store_long_relaxed(&value, new_value) +#define FT_ATOMIC_LOAD_LONG_RELAXED(value) \ + _Py_atomic_load_long_relaxed(&value) +#define FT_ATOMIC_STORE_ULONG_RELAXED(value, new_value) \ + _Py_atomic_store_ulong_relaxed(&value, new_value) +#define FT_ATOMIC_STORE_SSIZE_RELAXED(value, new_value) \ + _Py_atomic_store_ssize_relaxed(&value, new_value) +#define FT_ATOMIC_STORE_FLOAT_RELAXED(value, new_value) \ + _Py_atomic_store_float_relaxed(&value, new_value) +#define FT_ATOMIC_LOAD_FLOAT_RELAXED(value) \ + _Py_atomic_load_float_relaxed(&value) +#define FT_ATOMIC_STORE_DOUBLE_RELAXED(value, new_value) \ + _Py_atomic_store_double_relaxed(&value, new_value) +#define FT_ATOMIC_LOAD_DOUBLE_RELAXED(value) \ + _Py_atomic_load_double_relaxed(&value) +#define FT_ATOMIC_STORE_LLONG_RELAXED(value, new_value) \ + _Py_atomic_store_llong_relaxed(&value, new_value) +#define FT_ATOMIC_LOAD_LLONG_RELAXED(value) \ + _Py_atomic_load_llong_relaxed(&value) +#define FT_ATOMIC_STORE_ULLONG_RELAXED(value, new_value) \ + _Py_atomic_store_ullong_relaxed(&value, new_value) +#define FT_ATOMIC_LOAD_ULLONG_RELAXED(value) \ + _Py_atomic_load_ullong_relaxed(&value) + +#else +#define FT_ATOMIC_LOAD_PTR(value) value +#define FT_ATOMIC_STORE_PTR(value, new_value) value = new_value +#define FT_ATOMIC_LOAD_SSIZE(value) value +#define FT_ATOMIC_LOAD_SSIZE_ACQUIRE(value) value +#define FT_ATOMIC_LOAD_SSIZE_RELAXED(value) value +#define FT_ATOMIC_LOAD_PTR_ACQUIRE(value) value +#define FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(value) value +#define FT_ATOMIC_LOAD_PTR_RELAXED(value) value +#define FT_ATOMIC_LOAD_UINT8(value) value +#define FT_ATOMIC_STORE_UINT8(value, new_value) value = new_value +#define FT_ATOMIC_LOAD_UINT8_RELAXED(value) value +#define FT_ATOMIC_LOAD_UINT16_RELAXED(value) value +#define FT_ATOMIC_LOAD_UINT32_RELAXED(value) value +#define FT_ATOMIC_LOAD_ULONG_RELAXED(value) value +#define FT_ATOMIC_STORE_PTR_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_STORE_PTR_RELEASE(value, new_value) value = new_value +#define FT_ATOMIC_STORE_UINTPTR_RELEASE(value, new_value) value = new_value +#define FT_ATOMIC_STORE_SSIZE_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_STORE_UINT8_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_STORE_UINT16_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_STORE_UINT32_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_LOAD_CHAR_RELAXED(value) value +#define FT_ATOMIC_STORE_CHAR_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_LOAD_UCHAR_RELAXED(value) value +#define FT_ATOMIC_STORE_UCHAR_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_LOAD_SHORT_RELAXED(value) value +#define FT_ATOMIC_STORE_SHORT_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_LOAD_USHORT_RELAXED(value) value +#define FT_ATOMIC_STORE_USHORT_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_LOAD_INT_RELAXED(value) value +#define FT_ATOMIC_STORE_INT_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_LOAD_UINT_RELAXED(value) value +#define FT_ATOMIC_STORE_UINT_RELAXED(value, new_value) value = new_value +#define 
FT_ATOMIC_LOAD_LONG_RELAXED(value) value +#define FT_ATOMIC_STORE_LONG_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_STORE_ULONG_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_STORE_SSIZE_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_LOAD_FLOAT_RELAXED(value) value +#define FT_ATOMIC_STORE_FLOAT_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_LOAD_DOUBLE_RELAXED(value) value +#define FT_ATOMIC_STORE_DOUBLE_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_LOAD_LLONG_RELAXED(value) value +#define FT_ATOMIC_STORE_LLONG_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_LOAD_ULLONG_RELAXED(value) value +#define FT_ATOMIC_STORE_ULLONG_RELAXED(value, new_value) value = new_value + +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_ATOMIC_FT_WRAPPERS_H */ diff --git a/Include/internal/pycore_pybuffer.h b/Include/internal/pycore_pybuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..9439d2bd770587d6c22620635254cfa5bf727b40 --- /dev/null +++ b/Include/internal/pycore_pybuffer.h @@ -0,0 +1,21 @@ +#ifndef Py_INTERNAL_PYBUFFER_H +#define Py_INTERNAL_PYBUFFER_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + + +// Exported for the _interpchannels module. +PyAPI_FUNC(int) _PyBuffer_ReleaseInInterpreter( + PyInterpreterState *interp, Py_buffer *view); +PyAPI_FUNC(int) _PyBuffer_ReleaseInInterpreterAndRawFree( + PyInterpreterState *interp, Py_buffer *view); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_PYBUFFER_H */ diff --git a/Include/internal/pycore_pyerrors.h b/Include/internal/pycore_pyerrors.h new file mode 100644 index 0000000000000000000000000000000000000000..615cc23ec935284c9dbf7e2610a54c110727f4d6 --- /dev/null +++ b/Include/internal/pycore_pyerrors.h @@ -0,0 +1,190 @@ +#ifndef Py_INTERNAL_PYERRORS_H +#define Py_INTERNAL_PYERRORS_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + + +/* Error handling definitions */ + +extern _PyErr_StackItem* _PyErr_GetTopmostException(PyThreadState *tstate); +extern PyObject* _PyErr_GetHandledException(PyThreadState *); +extern void _PyErr_SetHandledException(PyThreadState *, PyObject *); +extern void _PyErr_GetExcInfo(PyThreadState *, PyObject **, PyObject **, PyObject **); + +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(void) _PyErr_SetKeyError(PyObject *); + + +// Like PyErr_Format(), but saves current exception as __context__ and +// __cause__. +// Export for '_sqlite3' shared extension. +PyAPI_FUNC(PyObject*) _PyErr_FormatFromCause( + PyObject *exception, + const char *format, /* ASCII-encoded string */ + ... 
+ ); + +extern int _PyException_AddNote( + PyObject *exc, + PyObject *note); + +extern int _PyErr_CheckSignals(void); + +/* Support for adding program text to SyntaxErrors */ + +// Export for test_peg_generator +PyAPI_FUNC(PyObject*) _PyErr_ProgramDecodedTextObject( + PyObject *filename, + int lineno, + const char* encoding); + +extern PyObject* _PyUnicodeTranslateError_Create( + PyObject *object, + Py_ssize_t start, + Py_ssize_t end, + const char *reason /* UTF-8 encoded string */ + ); + +extern void _Py_NO_RETURN _Py_FatalErrorFormat( + const char *func, + const char *format, + ...); + +extern PyObject* _PyErr_SetImportErrorWithNameFrom( + PyObject *, + PyObject *, + PyObject *, + PyObject *); + + +/* runtime lifecycle */ + +extern PyStatus _PyErr_InitTypes(PyInterpreterState *); +extern void _PyErr_FiniTypes(PyInterpreterState *); + + +/* other API */ + +static inline PyObject* _PyErr_Occurred(PyThreadState *tstate) +{ + assert(tstate != NULL); + if (tstate->current_exception == NULL) { + return NULL; + } + return (PyObject *)Py_TYPE(tstate->current_exception); +} + +static inline void _PyErr_ClearExcState(_PyErr_StackItem *exc_state) +{ + Py_CLEAR(exc_state->exc_value); +} + +extern PyObject* _PyErr_StackItemToExcInfoTuple( + _PyErr_StackItem *err_info); + +extern void _PyErr_Fetch( + PyThreadState *tstate, + PyObject **type, + PyObject **value, + PyObject **traceback); + +extern PyObject* _PyErr_GetRaisedException(PyThreadState *tstate); + +PyAPI_FUNC(int) _PyErr_ExceptionMatches( + PyThreadState *tstate, + PyObject *exc); + +extern void _PyErr_SetRaisedException(PyThreadState *tstate, PyObject *exc); + +extern void _PyErr_Restore( + PyThreadState *tstate, + PyObject *type, + PyObject *value, + PyObject *traceback); + +extern void _PyErr_SetObject( + PyThreadState *tstate, + PyObject *type, + PyObject *value); + +extern void _PyErr_ChainStackItem(void); + +PyAPI_FUNC(void) _PyErr_Clear(PyThreadState *tstate); + +extern void _PyErr_SetNone(PyThreadState *tstate, PyObject *exception); + +extern PyObject* _PyErr_NoMemory(PyThreadState *tstate); + +PyAPI_FUNC(void) _PyErr_SetString( + PyThreadState *tstate, + PyObject *exception, + const char *string); + +/* + * Set an exception with the error message decoded from the current locale + * encoding (LC_CTYPE). + * + * Exceptions occurring in decoding take priority over the desired exception. + * + * Exported for '_ctypes' shared extensions. 
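+ * (Contrast with _PyErr_SetString(), whose message is expected to be UTF-8
+ * rather than locale-encoded.)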
+ */ +PyAPI_FUNC(void) _PyErr_SetLocaleString( + PyObject *exception, + const char *string); + +PyAPI_FUNC(PyObject*) _PyErr_Format( + PyThreadState *tstate, + PyObject *exception, + const char *format, + ...); + +extern void _PyErr_NormalizeException( + PyThreadState *tstate, + PyObject **exc, + PyObject **val, + PyObject **tb); + +extern PyObject* _PyErr_FormatFromCauseTstate( + PyThreadState *tstate, + PyObject *exception, + const char *format, + ...); + +extern PyObject* _PyExc_CreateExceptionGroup( + const char *msg, + PyObject *excs); + +extern PyObject* _PyExc_PrepReraiseStar( + PyObject *orig, + PyObject *excs); + +extern int _PyErr_CheckSignalsTstate(PyThreadState *tstate); + +extern void _Py_DumpExtensionModules(int fd, PyInterpreterState *interp); +extern PyObject* _Py_CalculateSuggestions(PyObject *dir, PyObject *name); +extern PyObject* _Py_Offer_Suggestions(PyObject* exception); + +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(Py_ssize_t) _Py_UTF8_Edit_Cost(PyObject *str_a, PyObject *str_b, + Py_ssize_t max_cost); + +void _PyErr_FormatNote(const char *format, ...); + +/* Context manipulation (PEP 3134) */ + +Py_DEPRECATED(3.12) extern void _PyErr_ChainExceptions(PyObject *, PyObject *, PyObject *); + +// implementation detail for the codeop module. +// Exported for test.test_peg_generator.test_c_parser +PyAPI_DATA(PyTypeObject) _PyExc_IncompleteInputError; +#define PyExc_IncompleteInputError ((PyObject *)(&_PyExc_IncompleteInputError)) + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_PYERRORS_H */ diff --git a/Include/internal/pycore_pyhash.h b/Include/internal/pycore_pyhash.h new file mode 100644 index 0000000000000000000000000000000000000000..0ce08900e96f0b28b0f2c34a0151ccbe2906798b --- /dev/null +++ b/Include/internal/pycore_pyhash.h @@ -0,0 +1,107 @@ +#ifndef Py_INTERNAL_PYHASH_H +#define Py_INTERNAL_PYHASH_H + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +// Similar to Py_HashPointer(), but don't replace -1 with -2. +static inline Py_hash_t +_Py_HashPointerRaw(const void *ptr) +{ + uintptr_t x = (uintptr_t)ptr; + Py_BUILD_ASSERT(sizeof(x) == sizeof(ptr)); + + // Bottom 3 or 4 bits are likely to be 0; rotate x by 4 to the right + // to avoid excessive hash collisions for dicts and sets. + x = (x >> 4) | (x << (8 * sizeof(uintptr_t) - 4)); + + Py_BUILD_ASSERT(sizeof(x) == sizeof(Py_hash_t)); + return (Py_hash_t)x; +} + +// Export for '_datetime' shared extension +PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void*, Py_ssize_t); + +/* Hash secret + * + * memory layout on 64 bit systems + * cccccccc cccccccc cccccccc uc -- unsigned char[24] + * pppppppp ssssssss ........ fnv -- two Py_hash_t + * k0k0k0k0 k1k1k1k1 ........ siphash -- two uint64_t + * ........ ........ ssssssss djbx33a -- 16 bytes padding + one Py_hash_t + * ........ ........ eeeeeeee pyexpat XML hash salt + * + * memory layout on 32 bit systems + * cccccccc cccccccc cccccccc uc + * ppppssss ........ ........ fnv -- two Py_hash_t + * k0k0k0k0 k1k1k1k1 ........ siphash -- two uint64_t (*) + * ........ ........ ssss.... djbx33a -- 16 bytes padding + one Py_hash_t + * ........ ........ eeee.... pyexpat XML hash salt + * + * (*) The siphash member may not be available on 32 bit platforms without + * an unsigned int64 data type. 
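+ *
+ * The secret is filled in once at interpreter startup by the hash
+ * randomization setup (_Py_HashRandomization_Init), seeded from
+ * PYTHONHASHSEED or the OS entropy source.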
+ */ +typedef union { + /* ensure 24 bytes */ + unsigned char uc[24]; + /* two Py_hash_t for FNV */ + struct { + Py_hash_t prefix; + Py_hash_t suffix; + } fnv; + /* two uint64 for SipHash24 */ + struct { + uint64_t k0; + uint64_t k1; + } siphash; + /* a different (!) Py_hash_t for small string optimization */ + struct { + unsigned char padding[16]; + Py_hash_t suffix; + } djbx33a; + struct { + unsigned char padding[16]; + Py_hash_t hashsalt; + } expat; +} _Py_HashSecret_t; + +// Export for '_elementtree' shared extension +PyAPI_DATA(_Py_HashSecret_t) _Py_HashSecret; + +#ifdef Py_DEBUG +extern int _Py_HashSecret_Initialized; +#endif + + +struct pyhash_runtime_state { + struct { +#ifndef MS_WINDOWS + int fd; + dev_t st_dev; + ino_t st_ino; +#else + // This is a placeholder so the struct isn't empty on Windows. + int _not_used; +#endif + } urandom_cache; +}; + +#ifndef MS_WINDOWS +# define _py_urandom_cache_INIT \ + { \ + .fd = -1, \ + } +#else +# define _py_urandom_cache_INIT {0} +#endif + +#define pyhash_state_INIT \ + { \ + .urandom_cache = _py_urandom_cache_INIT, \ + } + + +extern uint64_t _Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz); + +#endif // !Py_INTERNAL_PYHASH_H diff --git a/Include/internal/pycore_pylifecycle.h b/Include/internal/pycore_pylifecycle.h new file mode 100644 index 0000000000000000000000000000000000000000..f426ae0e103b9c991229c1e1cc2c9c55ad12b715 --- /dev/null +++ b/Include/internal/pycore_pylifecycle.h @@ -0,0 +1,136 @@ +#ifndef Py_INTERNAL_LIFECYCLE_H +#define Py_INTERNAL_LIFECYCLE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_runtime.h" // _PyRuntimeState + +/* Forward declarations */ +struct _PyArgv; +struct pyruntimestate; + +extern int _Py_SetFileSystemEncoding( + const char *encoding, + const char *errors); +extern void _Py_ClearFileSystemEncoding(void); +extern PyStatus _PyUnicode_InitEncodings(PyThreadState *tstate); +#ifdef MS_WINDOWS +extern int _PyUnicode_EnableLegacyWindowsFSEncoding(void); +#endif + +extern int _Py_IsLocaleCoercionTarget(const char *ctype_loc); + +/* Various one-time initializers */ + +extern void _Py_InitVersion(void); +extern PyStatus _PyFaulthandler_Init(int enable); +extern PyObject * _PyBuiltin_Init(PyInterpreterState *interp); +extern PyStatus _PySys_Create( + PyThreadState *tstate, + PyObject **sysmod_p); +extern PyStatus _PySys_ReadPreinitWarnOptions(PyWideStringList *options); +extern PyStatus _PySys_ReadPreinitXOptions(PyConfig *config); +extern int _PySys_UpdateConfig(PyThreadState *tstate); +extern void _PySys_FiniTypes(PyInterpreterState *interp); +extern int _PyBuiltins_AddExceptions(PyObject * bltinmod); +extern PyStatus _Py_HashRandomization_Init(const PyConfig *); + +extern PyStatus _PyGC_Init(PyInterpreterState *interp); +extern PyStatus _PyAtExit_Init(PyInterpreterState *interp); + +/* Various internal finalizers */ + +extern int _PySignal_Init(int install_signal_handlers); +extern void _PySignal_Fini(void); + +extern void _PyGC_Fini(PyInterpreterState *interp); +extern void _Py_HashRandomization_Fini(void); +extern void _PyFaulthandler_Fini(void); +extern void _PyHash_Fini(void); +extern void _PyTraceMalloc_Fini(void); +extern void _PyWarnings_Fini(PyInterpreterState *interp); +extern void _PyAST_Fini(PyInterpreterState *interp); +extern void _PyAtExit_Fini(PyInterpreterState *interp); +extern void _PyThread_FiniType(PyInterpreterState *interp); +extern void _PyArg_Fini(void); +extern void 
_Py_FinalizeAllocatedBlocks(_PyRuntimeState *); + +extern PyStatus _PyGILState_Init(PyInterpreterState *interp); +extern void _PyGILState_SetTstate(PyThreadState *tstate); +extern void _PyGILState_Fini(PyInterpreterState *interp); + +extern void _PyGC_DumpShutdownStats(PyInterpreterState *interp); + +extern PyStatus _Py_PreInitializeFromPyArgv( + const PyPreConfig *src_config, + const struct _PyArgv *args); +extern PyStatus _Py_PreInitializeFromConfig( + const PyConfig *config, + const struct _PyArgv *args); + +extern wchar_t * _Py_GetStdlibDir(void); + +extern int _Py_HandleSystemExit(int *exitcode_p); + +extern PyObject* _PyErr_WriteUnraisableDefaultHook(PyObject *unraisable); + +extern void _PyErr_Print(PyThreadState *tstate); +extern void _PyErr_Display(PyObject *file, PyObject *exception, + PyObject *value, PyObject *tb); +extern void _PyErr_DisplayException(PyObject *file, PyObject *exc); + +extern void _PyThreadState_DeleteCurrent(PyThreadState *tstate); + +extern void _PyAtExit_Call(PyInterpreterState *interp); + +extern int _Py_IsCoreInitialized(void); + +extern int _Py_FdIsInteractive(FILE *fp, PyObject *filename); + +extern const char* _Py_gitidentifier(void); +extern const char* _Py_gitversion(void); + +// Export for '_asyncio' shared extension +PyAPI_FUNC(int) _Py_IsInterpreterFinalizing(PyInterpreterState *interp); + +/* Random */ +extern int _PyOS_URandom(void *buffer, Py_ssize_t size); + +// Export for '_random' shared extension +PyAPI_FUNC(int) _PyOS_URandomNonblock(void *buffer, Py_ssize_t size); + +/* Legacy locale support */ +extern int _Py_CoerceLegacyLocale(int warn); +extern int _Py_LegacyLocaleDetected(int warn); + +// Export for 'readline' shared extension +PyAPI_FUNC(char*) _Py_SetLocaleFromEnv(int category); + +// Export for special main.c string compiling with source tracebacks +int _PyRun_SimpleStringFlagsWithName(const char *command, const char* name, PyCompilerFlags *flags); + + +/* interpreter config */ + +// Export for _testinternalcapi shared extension +PyAPI_FUNC(int) _PyInterpreterConfig_InitFromState( + PyInterpreterConfig *, + PyInterpreterState *); +PyAPI_FUNC(PyObject *) _PyInterpreterConfig_AsDict(PyInterpreterConfig *); +PyAPI_FUNC(int) _PyInterpreterConfig_InitFromDict( + PyInterpreterConfig *, + PyObject *); +PyAPI_FUNC(int) _PyInterpreterConfig_UpdateFromDict( + PyInterpreterConfig *, + PyObject *); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_LIFECYCLE_H */ diff --git a/Include/internal/pycore_pymath.h b/Include/internal/pycore_pymath.h new file mode 100644 index 0000000000000000000000000000000000000000..7a4e1c1eb714f732a399baaecb81848b8d0f5527 --- /dev/null +++ b/Include/internal/pycore_pymath.h @@ -0,0 +1,205 @@ +#ifndef Py_INTERNAL_PYMATH_H +#define Py_INTERNAL_PYMATH_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + + +/* _Py_ADJUST_ERANGE1(x) + * _Py_ADJUST_ERANGE2(x, y) + * Set errno to 0 before calling a libm function, and invoke one of these + * macros after, passing the function result(s) (_Py_ADJUST_ERANGE2 is useful + * for functions returning complex results). This makes two kinds of + * adjustments to errno: (A) If it looks like the platform libm set + * errno=ERANGE due to underflow, clear errno. (B) If it looks like the + * platform libm overflowed but didn't set errno, force errno to ERANGE. In + * effect, we're trying to force a useful implementation of C89 errno + * behavior. + * Caution: + * This isn't reliable. 
C99 no longer requires libm to set errno under + * any exceptional condition, but does require +- HUGE_VAL return + * values on overflow. A 754 box *probably* maps HUGE_VAL to a + * double infinity, and we're cool if that's so, unless the input + * was an infinity and an infinity is the expected result. A C89 + * system sets errno to ERANGE, so we check for that too. We're + * out of luck if a C99 754 box doesn't map HUGE_VAL to +Inf, or + * if the returned result is a NaN, or if a C89 box returns HUGE_VAL + * in non-overflow cases. + */ +static inline void _Py_ADJUST_ERANGE1(double x) +{ + if (errno == 0) { + if (x == Py_HUGE_VAL || x == -Py_HUGE_VAL) { + errno = ERANGE; + } + } + else if (errno == ERANGE && x == 0.0) { + errno = 0; + } +} + +static inline void _Py_ADJUST_ERANGE2(double x, double y) +{ + if (x == Py_HUGE_VAL || x == -Py_HUGE_VAL || + y == Py_HUGE_VAL || y == -Py_HUGE_VAL) + { + if (errno == 0) { + errno = ERANGE; + } + } + else if (errno == ERANGE) { + errno = 0; + } +} + + +//--- HAVE_PY_SET_53BIT_PRECISION macro ------------------------------------ +// +// The functions _Py_dg_strtod() and _Py_dg_dtoa() in Python/dtoa.c (which are +// required to support the short float repr introduced in Python 3.1) require +// that the floating-point unit that's being used for arithmetic operations on +// C doubles is set to use 53-bit precision. It also requires that the FPU +// rounding mode is round-half-to-even, but that's less often an issue. +// +// If your FPU isn't already set to 53-bit precision/round-half-to-even, and +// you want to make use of _Py_dg_strtod() and _Py_dg_dtoa(), then you should: +// +// #define HAVE_PY_SET_53BIT_PRECISION 1 +// +// and also give appropriate definitions for the following three macros: +// +// * _Py_SET_53BIT_PRECISION_HEADER: any variable declarations needed to +// use the two macros below. +// * _Py_SET_53BIT_PRECISION_START: store original FPU settings, and +// set FPU to 53-bit precision/round-half-to-even +// * _Py_SET_53BIT_PRECISION_END: restore original FPU settings +// +// The macros are designed to be used within a single C function: see +// Python/pystrtod.c for an example of their use. + + +// Get and set x87 control word for gcc/x86 +#ifdef HAVE_GCC_ASM_FOR_X87 +#define HAVE_PY_SET_53BIT_PRECISION 1 + +// Functions defined in Python/pymath.c +extern unsigned short _Py_get_387controlword(void); +extern void _Py_set_387controlword(unsigned short); + +#define _Py_SET_53BIT_PRECISION_HEADER \ + unsigned short old_387controlword, new_387controlword +#define _Py_SET_53BIT_PRECISION_START \ + do { \ + old_387controlword = _Py_get_387controlword(); \ + new_387controlword = (old_387controlword & ~0x0f00) | 0x0200; \ + if (new_387controlword != old_387controlword) { \ + _Py_set_387controlword(new_387controlword); \ + } \ + } while (0) +#define _Py_SET_53BIT_PRECISION_END \ + do { \ + if (new_387controlword != old_387controlword) { \ + _Py_set_387controlword(old_387controlword); \ + } \ + } while (0) +#endif + +// Get and set x87 control word for VisualStudio/x86. +// x87 is not supported in 64-bit or ARM. +#if defined(_MSC_VER) && !defined(_WIN64) && !defined(_M_ARM) +#define HAVE_PY_SET_53BIT_PRECISION 1 + +#include // __control87_2() + +#define _Py_SET_53BIT_PRECISION_HEADER \ + unsigned int old_387controlword, new_387controlword, out_387controlword + // We use the __control87_2 function to set only the x87 control word. + // The SSE control word is unaffected. 
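+//
+// The three macros are used together inside a single function, e.g. (sketch
+// modeled on Python/pystrtod.c; `parse` is a made-up name):
+//
+//   double parse(const char *s, char **end)
+//   {
+//       _Py_SET_53BIT_PRECISION_HEADER;
+//       _Py_SET_53BIT_PRECISION_START;
+//       double d = _Py_dg_strtod(s, end);
+//       _Py_SET_53BIT_PRECISION_END;
+//       return d;
+//   }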
+#define _Py_SET_53BIT_PRECISION_START \ + do { \ + __control87_2(0, 0, &old_387controlword, NULL); \ + new_387controlword = \ + (old_387controlword & ~(_MCW_PC | _MCW_RC)) | (_PC_53 | _RC_NEAR); \ + if (new_387controlword != old_387controlword) { \ + __control87_2(new_387controlword, _MCW_PC | _MCW_RC, \ + &out_387controlword, NULL); \ + } \ + } while (0) +#define _Py_SET_53BIT_PRECISION_END \ + do { \ + if (new_387controlword != old_387controlword) { \ + __control87_2(old_387controlword, _MCW_PC | _MCW_RC, \ + &out_387controlword, NULL); \ + } \ + } while (0) +#endif + + +// MC68881 +#ifdef HAVE_GCC_ASM_FOR_MC68881 +#define HAVE_PY_SET_53BIT_PRECISION 1 +#define _Py_SET_53BIT_PRECISION_HEADER \ + unsigned int old_fpcr, new_fpcr +#define _Py_SET_53BIT_PRECISION_START \ + do { \ + __asm__ ("fmove.l %%fpcr,%0" : "=g" (old_fpcr)); \ + /* Set double precision / round to nearest. */ \ + new_fpcr = (old_fpcr & ~0xf0) | 0x80; \ + if (new_fpcr != old_fpcr) { \ + __asm__ volatile ("fmove.l %0,%%fpcr" : : "g" (new_fpcr));\ + } \ + } while (0) +#define _Py_SET_53BIT_PRECISION_END \ + do { \ + if (new_fpcr != old_fpcr) { \ + __asm__ volatile ("fmove.l %0,%%fpcr" : : "g" (old_fpcr)); \ + } \ + } while (0) +#endif + +// Default definitions are empty +#ifndef _Py_SET_53BIT_PRECISION_HEADER +# define _Py_SET_53BIT_PRECISION_HEADER +# define _Py_SET_53BIT_PRECISION_START +# define _Py_SET_53BIT_PRECISION_END +#endif + + +//--- _PY_SHORT_FLOAT_REPR macro ------------------------------------------- + +// If we can't guarantee 53-bit precision, don't use the code +// in Python/dtoa.c, but fall back to standard code. This +// means that repr of a float will be long (17 significant digits). +// +// Realistically, there are two things that could go wrong: +// +// (1) doubles aren't IEEE 754 doubles, or +// (2) we're on x86 with the rounding precision set to 64-bits +// (extended precision), and we don't know how to change +// the rounding precision. +#if !defined(DOUBLE_IS_LITTLE_ENDIAN_IEEE754) && \ + !defined(DOUBLE_IS_BIG_ENDIAN_IEEE754) && \ + !defined(DOUBLE_IS_ARM_MIXED_ENDIAN_IEEE754) +# define _PY_SHORT_FLOAT_REPR 0 +#endif + +// Double rounding is symptomatic of use of extended precision on x86. +// If we're seeing double rounding, and we don't have any mechanism available +// for changing the FPU rounding precision, then don't use Python/dtoa.c. +#if defined(X87_DOUBLE_ROUNDING) && !defined(HAVE_PY_SET_53BIT_PRECISION) +# define _PY_SHORT_FLOAT_REPR 0 +#endif + +#ifndef _PY_SHORT_FLOAT_REPR +# define _PY_SHORT_FLOAT_REPR 1 +#endif + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_PYMATH_H */ diff --git a/Include/internal/pycore_pymem.h b/Include/internal/pycore_pymem.h new file mode 100644 index 0000000000000000000000000000000000000000..e9593dbff1cc60aeb2ac5ee59a04f41b8a7fee5c --- /dev/null +++ b/Include/internal/pycore_pymem.h @@ -0,0 +1,138 @@ +#ifndef Py_INTERNAL_PYMEM_H +#define Py_INTERNAL_PYMEM_H + +#include "pycore_llist.h" // struct llist_node +#include "pycore_lock.h" // PyMutex + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +// Try to get the allocators name set by _PyMem_SetupAllocators(). +// Return NULL if unknown. +// Export for '_testinternalcapi' shared extension. +PyAPI_FUNC(const char*) _PyMem_GetCurrentAllocatorName(void); + +// strdup() using PyMem_RawMalloc() +extern char* _PyMem_RawStrdup(const char *str); + +// strdup() using PyMem_Malloc(). +// Export for '_pickle ' shared extension. 
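+// The returned copy must be released with PyMem_Free().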
+PyAPI_FUNC(char*) _PyMem_Strdup(const char *str); + +// wcsdup() using PyMem_RawMalloc() +extern wchar_t* _PyMem_RawWcsdup(const wchar_t *str); + +typedef struct { + /* We tag each block with an API ID in order to tag API violations */ + char api_id; + PyMemAllocatorEx alloc; +} debug_alloc_api_t; + +struct _pymem_allocators { + PyMutex mutex; + struct { + PyMemAllocatorEx raw; + PyMemAllocatorEx mem; + PyMemAllocatorEx obj; + } standard; + struct { + debug_alloc_api_t raw; + debug_alloc_api_t mem; + debug_alloc_api_t obj; + } debug; + int is_debug_enabled; + PyObjectArenaAllocator obj_arena; +}; + +struct _Py_mem_interp_free_queue { + int has_work; // true if the queue is not empty + PyMutex mutex; // protects the queue + struct llist_node head; // queue of _mem_work_chunk items +}; + +/* Set the memory allocator of the specified domain to the default. + Save the old allocator into *old_alloc if it's non-NULL. + Return on success, or return -1 if the domain is unknown. */ +extern int _PyMem_SetDefaultAllocator( + PyMemAllocatorDomain domain, + PyMemAllocatorEx *old_alloc); + +/* Special bytes broadcast into debug memory blocks at appropriate times. + Strings of these are unlikely to be valid addresses, floats, ints or + 7-bit ASCII. + + - PYMEM_CLEANBYTE: clean (newly allocated) memory + - PYMEM_DEADBYTE dead (newly freed) memory + - PYMEM_FORBIDDENBYTE: untouchable bytes at each end of a block + + Byte patterns 0xCB, 0xDB and 0xFB have been replaced with 0xCD, 0xDD and + 0xFD to use the same values as Windows CRT debug malloc() and free(). + If modified, _PyMem_IsPtrFreed() should be updated as well. */ +#define PYMEM_CLEANBYTE 0xCD +#define PYMEM_DEADBYTE 0xDD +#define PYMEM_FORBIDDENBYTE 0xFD + +/* Heuristic checking if a pointer value is newly allocated + (uninitialized), newly freed or NULL (is equal to zero). + + The pointer is not dereferenced, only the pointer value is checked. + + The heuristic relies on the debug hooks on Python memory allocators which + fills newly allocated memory with CLEANBYTE (0xCD) and newly freed memory + with DEADBYTE (0xDD). Detect also "untouchable bytes" marked + with FORBIDDENBYTE (0xFD). */ +static inline int _PyMem_IsPtrFreed(const void *ptr) +{ + uintptr_t value = (uintptr_t)ptr; +#if SIZEOF_VOID_P == 8 + return (value == 0 + || value == (uintptr_t)0xCDCDCDCDCDCDCDCD + || value == (uintptr_t)0xDDDDDDDDDDDDDDDD + || value == (uintptr_t)0xFDFDFDFDFDFDFDFD); +#elif SIZEOF_VOID_P == 4 + return (value == 0 + || value == (uintptr_t)0xCDCDCDCD + || value == (uintptr_t)0xDDDDDDDD + || value == (uintptr_t)0xFDFDFDFD); +#else +# error "unknown pointer size" +#endif +} + +extern int _PyMem_GetAllocatorName( + const char *name, + PyMemAllocatorName *allocator); + +/* Configure the Python memory allocators. + Pass PYMEM_ALLOCATOR_DEFAULT to use default allocators. + PYMEM_ALLOCATOR_NOT_SET does nothing. */ +extern int _PyMem_SetupAllocators(PyMemAllocatorName allocator); + +/* Is the debug allocator enabled? */ +extern int _PyMem_DebugEnabled(void); + +// Enqueue a pointer to be freed possibly after some delay. +extern void _PyMem_FreeDelayed(void *ptr, size_t size); + +// Enqueue an object to be freed possibly after some delay +extern void _PyObject_FreeDelayed(void *ptr); + +// Periodically process delayed free requests. +extern void _PyMem_ProcessDelayed(PyThreadState *tstate); + +// Abandon all thread-local delayed free requests and push them to the +// interpreter's queue. 
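+// (This keeps pending frees from being lost once the thread is gone.)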
+extern void _PyMem_AbandonDelayed(PyThreadState *tstate); + +// On interpreter shutdown, frees all delayed free requests. +extern void _PyMem_FiniDelayed(PyInterpreterState *interp); + +#ifdef __cplusplus +} +#endif +#endif // !Py_INTERNAL_PYMEM_H diff --git a/Include/internal/pycore_pymem_init.h b/Include/internal/pycore_pymem_init.h new file mode 100644 index 0000000000000000000000000000000000000000..c593edc86d9952aa9b001006122460ee6f215c44 --- /dev/null +++ b/Include/internal/pycore_pymem_init.h @@ -0,0 +1,103 @@ +#ifndef Py_INTERNAL_PYMEM_INIT_H +#define Py_INTERNAL_PYMEM_INIT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + + +/********************************/ +/* the allocators' initializers */ + +extern void * _PyMem_RawMalloc(void *, size_t); +extern void * _PyMem_RawCalloc(void *, size_t, size_t); +extern void * _PyMem_RawRealloc(void *, void *, size_t); +extern void _PyMem_RawFree(void *, void *); +#define PYRAW_ALLOC {NULL, _PyMem_RawMalloc, _PyMem_RawCalloc, _PyMem_RawRealloc, _PyMem_RawFree} + +#ifdef Py_GIL_DISABLED +// Py_GIL_DISABLED requires mimalloc +extern void* _PyObject_MiMalloc(void *, size_t); +extern void* _PyObject_MiCalloc(void *, size_t, size_t); +extern void _PyObject_MiFree(void *, void *); +extern void* _PyObject_MiRealloc(void *, void *, size_t); +# define PYOBJ_ALLOC {NULL, _PyObject_MiMalloc, _PyObject_MiCalloc, _PyObject_MiRealloc, _PyObject_MiFree} +extern void* _PyMem_MiMalloc(void *, size_t); +extern void* _PyMem_MiCalloc(void *, size_t, size_t); +extern void _PyMem_MiFree(void *, void *); +extern void* _PyMem_MiRealloc(void *, void *, size_t); +# define PYMEM_ALLOC {NULL, _PyMem_MiMalloc, _PyMem_MiCalloc, _PyMem_MiRealloc, _PyMem_MiFree} +#elif defined(WITH_PYMALLOC) +extern void* _PyObject_Malloc(void *, size_t); +extern void* _PyObject_Calloc(void *, size_t, size_t); +extern void _PyObject_Free(void *, void *); +extern void* _PyObject_Realloc(void *, void *, size_t); +# define PYOBJ_ALLOC {NULL, _PyObject_Malloc, _PyObject_Calloc, _PyObject_Realloc, _PyObject_Free} +# define PYMEM_ALLOC PYOBJ_ALLOC +#else +# define PYOBJ_ALLOC PYRAW_ALLOC +# define PYMEM_ALLOC PYOBJ_ALLOC +#endif // WITH_PYMALLOC + + +extern void* _PyMem_DebugRawMalloc(void *, size_t); +extern void* _PyMem_DebugRawCalloc(void *, size_t, size_t); +extern void* _PyMem_DebugRawRealloc(void *, void *, size_t); +extern void _PyMem_DebugRawFree(void *, void *); + +extern void* _PyMem_DebugMalloc(void *, size_t); +extern void* _PyMem_DebugCalloc(void *, size_t, size_t); +extern void* _PyMem_DebugRealloc(void *, void *, size_t); +extern void _PyMem_DebugFree(void *, void *); + +#define PYDBGRAW_ALLOC(runtime) \ + {&(runtime).allocators.debug.raw, _PyMem_DebugRawMalloc, _PyMem_DebugRawCalloc, _PyMem_DebugRawRealloc, _PyMem_DebugRawFree} +#define PYDBGMEM_ALLOC(runtime) \ + {&(runtime).allocators.debug.mem, _PyMem_DebugMalloc, _PyMem_DebugCalloc, _PyMem_DebugRealloc, _PyMem_DebugFree} +#define PYDBGOBJ_ALLOC(runtime) \ + {&(runtime).allocators.debug.obj, _PyMem_DebugMalloc, _PyMem_DebugCalloc, _PyMem_DebugRealloc, _PyMem_DebugFree} + +extern void * _PyMem_ArenaAlloc(void *, size_t); +extern void _PyMem_ArenaFree(void *, void *, size_t); + +#ifdef Py_DEBUG +# define _pymem_allocators_standard_INIT(runtime) \ + { \ + PYDBGRAW_ALLOC(runtime), \ + PYDBGMEM_ALLOC(runtime), \ + PYDBGOBJ_ALLOC(runtime), \ + } +# define _pymem_is_debug_enabled_INIT 1 +#else +# define _pymem_allocators_standard_INIT(runtime) \ + { \ + 
PYRAW_ALLOC, \ + PYMEM_ALLOC, \ + PYOBJ_ALLOC, \ + } +# define _pymem_is_debug_enabled_INIT 0 +#endif + +#define _pymem_allocators_debug_INIT \ + { \ + {'r', PYRAW_ALLOC}, \ + {'m', PYMEM_ALLOC}, \ + {'o', PYOBJ_ALLOC}, \ + } + +# define _pymem_allocators_obj_arena_INIT \ + { NULL, _PyMem_ArenaAlloc, _PyMem_ArenaFree } + + +#define _Py_mem_free_queue_INIT(queue) \ + { \ + .head = LLIST_INIT(queue.head), \ + } + +#ifdef __cplusplus +} +#endif +#endif // !Py_INTERNAL_PYMEM_INIT_H diff --git a/Include/internal/pycore_pystate.h b/Include/internal/pycore_pystate.h new file mode 100644 index 0000000000000000000000000000000000000000..b0e72523f58ed8ec5aadc6c7844b652aa30d727c --- /dev/null +++ b/Include/internal/pycore_pystate.h @@ -0,0 +1,299 @@ +#ifndef Py_INTERNAL_PYSTATE_H +#define Py_INTERNAL_PYSTATE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_freelist.h" // _PyFreeListState +#include "pycore_runtime.h" // _PyRuntime +#include "pycore_tstate.h" // _PyThreadStateImpl + + +// Values for PyThreadState.state. A thread must be in the "attached" state +// before calling most Python APIs. If the GIL is enabled, then "attached" +// implies that the thread holds the GIL and "detached" implies that the +// thread does not hold the GIL (or is in the process of releasing it). In +// `--disable-gil` builds, multiple threads may be "attached" to the same +// interpreter at the same time. Only the "bound" thread may perform the +// transitions between "attached" and "detached" on its own PyThreadState. +// +// The "suspended" state is used to implement stop-the-world pauses, such as +// for cyclic garbage collection. It is only used in `--disable-gil` builds. +// The "suspended" state is similar to the "detached" state in that in both +// states the thread is not allowed to call most Python APIs. However, unlike +// the "detached" state, a thread may not transition itself out from the +// "suspended" state. Only the thread performing a stop-the-world pause may +// transition a thread from the "suspended" state back to the "detached" state. +// +// State transition diagram: +// +// (bound thread) (stop-the-world thread) +// [attached] <-> [detached] <-> [suspended] +// | ^ +// +---------------------------->---------------------------+ +// (bound thread) +// +// The (bound thread) and (stop-the-world thread) labels indicate which thread +// is allowed to perform the transition. +#define _Py_THREAD_DETACHED 0 +#define _Py_THREAD_ATTACHED 1 +#define _Py_THREAD_SUSPENDED 2 + + +/* Check if the current thread is the main thread. + Use _Py_IsMainInterpreter() to check if it's the main interpreter. */ +static inline int +_Py_IsMainThread(void) +{ + unsigned long thread = PyThread_get_thread_ident(); + return (thread == _PyRuntime.main_thread); +} + + +static inline PyInterpreterState * +_PyInterpreterState_Main(void) +{ + return _PyRuntime.interpreters.main; +} + +static inline int +_Py_IsMainInterpreter(PyInterpreterState *interp) +{ + return (interp == _PyInterpreterState_Main()); +} + +static inline int +_Py_IsMainInterpreterFinalizing(PyInterpreterState *interp) +{ + /* bpo-39877: Access _PyRuntime directly rather than using + tstate->interp->runtime to support calls from Python daemon threads. + After Py_Finalize() has been called, tstate can be a dangling pointer: + point to PyThreadState freed memory. 
*/
+    return (_PyRuntimeState_GetFinalizing(&_PyRuntime) != NULL &&
+            interp == &_PyRuntime._main_interpreter);
+}
+
+// Export for _interpreters module.
+PyAPI_FUNC(PyObject *) _PyInterpreterState_GetIDObject(PyInterpreterState *);
+
+// Export for _interpreters module.
+PyAPI_FUNC(int) _PyInterpreterState_SetRunningMain(PyInterpreterState *);
+PyAPI_FUNC(void) _PyInterpreterState_SetNotRunningMain(PyInterpreterState *);
+PyAPI_FUNC(int) _PyInterpreterState_IsRunningMain(PyInterpreterState *);
+PyAPI_FUNC(int) _PyInterpreterState_FailIfRunningMain(PyInterpreterState *);
+
+extern int _PyThreadState_IsRunningMain(PyThreadState *);
+extern void _PyInterpreterState_ReinitRunningMain(PyThreadState *);
+
+
+static inline const PyConfig *
+_Py_GetMainConfig(void)
+{
+    PyInterpreterState *interp = _PyInterpreterState_Main();
+    if (interp == NULL) {
+        return NULL;
+    }
+    return _PyInterpreterState_GetConfig(interp);
+}
+
+
+/* Only handle signals on the main thread of the main interpreter. */
+static inline int
+_Py_ThreadCanHandleSignals(PyInterpreterState *interp)
+{
+    return (_Py_IsMainThread() && _Py_IsMainInterpreter(interp));
+}
+
+
+/* Variable and static inline functions for in-line access to current thread
+   and interpreter state */
+
+#if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE)
+extern _Py_thread_local PyThreadState *_Py_tss_tstate;
+#endif
+
+#ifndef NDEBUG
+extern int _PyThreadState_CheckConsistency(PyThreadState *tstate);
+#endif
+
+int _PyThreadState_MustExit(PyThreadState *tstate);
+
+// Export for most shared extensions, used via _PyThreadState_GET() static
+// inline function.
+PyAPI_FUNC(PyThreadState *) _PyThreadState_GetCurrent(void);
+
+/* Get the current Python thread state.
+
+   This function is unsafe: it does not check for error and it can return
+   NULL.
+
+   The caller must hold the GIL.
+
+   See also PyThreadState_Get() and PyThreadState_GetUnchecked(). */
+static inline PyThreadState*
+_PyThreadState_GET(void)
+{
+#if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE)
+    return _Py_tss_tstate;
+#else
+    return _PyThreadState_GetCurrent();
+#endif
+}
+
+// Attaches the current thread to the interpreter.
+//
+// This may block while acquiring the GIL (if the GIL is enabled) or while
+// waiting for a stop-the-world pause (if the GIL is disabled).
+//
+// High-level code should generally call PyEval_RestoreThread() instead, which
+// calls this function.
+extern void _PyThreadState_Attach(PyThreadState *tstate);
+
+// Detaches the current thread from the interpreter.
+//
+// High-level code should generally call PyEval_SaveThread() instead, which
+// calls this function.
+extern void _PyThreadState_Detach(PyThreadState *tstate);
+
+// Detaches the current thread into the "suspended" state if a stop-the-world
+// pause is in progress.
+//
+// If there is no stop-the-world pause in progress, then the thread switches
+// to the "detached" state.
+extern void _PyThreadState_Suspend(PyThreadState *tstate);
+
+// Perform a stop-the-world pause for all threads in all interpreters.
+//
+// Threads in the "attached" state are paused and transitioned to the
+// "suspended" state. Threads in the "detached" state switch to the
+// "suspended" state, preventing them from reattaching until the
+// stop-the-world pause is complete.
+//
+// NOTE: This is a no-op outside of Py_GIL_DISABLED builds.
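
   As an illustration of the pause/resume contract (a sketch, not part of
   this diff), a global pause brackets work on state that other threads
   normally touch concurrently, using the functions declared just below:

       static void
       inspect_all_threads_example(_PyRuntimeState *runtime)
       {
           _PyEval_StopTheWorldAll(runtime);   // other threads park in "suspended"
           // ... safely walk or mutate cross-thread state here ...
           _PyEval_StartTheWorldAll(runtime);  // let the world reattach and resume
       }
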
+extern void _PyEval_StopTheWorldAll(_PyRuntimeState *runtime); +extern void _PyEval_StartTheWorldAll(_PyRuntimeState *runtime); + +// Perform a stop-the-world pause for threads in the specified interpreter. +// +// NOTE: This is a no-op outside of Py_GIL_DISABLED builds. +extern void _PyEval_StopTheWorld(PyInterpreterState *interp); +extern void _PyEval_StartTheWorld(PyInterpreterState *interp); + + +static inline void +_Py_EnsureFuncTstateNotNULL(const char *func, PyThreadState *tstate) +{ + if (tstate == NULL) { + _Py_FatalErrorFunc(func, + "the function must be called with the GIL held, " + "after Python initialization and before Python finalization, " + "but the GIL is released (the current Python thread state is NULL)"); + } +} + +// Call Py_FatalError() if tstate is NULL +#define _Py_EnsureTstateNotNULL(tstate) \ + _Py_EnsureFuncTstateNotNULL(__func__, (tstate)) + + +/* Get the current interpreter state. + + The function is unsafe: it does not check for error and it can return NULL. + + The caller must hold the GIL. + + See also PyInterpreterState_Get() + and _PyGILState_GetInterpreterStateUnsafe(). */ +static inline PyInterpreterState* _PyInterpreterState_GET(void) { + PyThreadState *tstate = _PyThreadState_GET(); +#ifdef Py_DEBUG + _Py_EnsureTstateNotNULL(tstate); +#endif + return tstate->interp; +} + + +// PyThreadState functions + +// Export for _testinternalcapi +PyAPI_FUNC(PyThreadState *) _PyThreadState_New( + PyInterpreterState *interp, + int whence); +extern void _PyThreadState_Bind(PyThreadState *tstate); +PyAPI_FUNC(PyThreadState *) _PyThreadState_NewBound( + PyInterpreterState *interp, + int whence); +extern PyThreadState * _PyThreadState_RemoveExcept(PyThreadState *tstate); +extern void _PyThreadState_DeleteList(PyThreadState *list); +extern void _PyThreadState_ClearMimallocHeaps(PyThreadState *tstate); + +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(PyObject*) _PyThreadState_GetDict(PyThreadState *tstate); + +/* The implementation of sys._current_exceptions() Returns a dict mapping + thread id to that thread's current exception. +*/ +extern PyObject* _PyThread_CurrentExceptions(void); + + +/* Other */ + +extern PyThreadState * _PyThreadState_Swap( + _PyRuntimeState *runtime, + PyThreadState *newts); + +extern PyStatus _PyInterpreterState_Enable(_PyRuntimeState *runtime); + +#ifdef HAVE_FORK +extern PyStatus _PyInterpreterState_DeleteExceptMain(_PyRuntimeState *runtime); +extern void _PySignal_AfterFork(void); +#endif + +// Export for the stable ABI +PyAPI_FUNC(int) _PyState_AddModule( + PyThreadState *tstate, + PyObject* module, + PyModuleDef* def); + + +extern int _PyOS_InterruptOccurred(PyThreadState *tstate); + +#define HEAD_LOCK(runtime) \ + PyMutex_LockFlags(&(runtime)->interpreters.mutex, _Py_LOCK_DONT_DETACH) +#define HEAD_UNLOCK(runtime) \ + PyMutex_Unlock(&(runtime)->interpreters.mutex) + +// Get the configuration of the current interpreter. +// The caller must hold the GIL. +// Export for test_peg_generator. +PyAPI_FUNC(const PyConfig*) _Py_GetConfig(void); + +// Get the single PyInterpreterState used by this process' GILState +// implementation. +// +// This function doesn't check for error. Return NULL before _PyGILState_Init() +// is called and after _PyGILState_Fini() is called. +// +// See also PyInterpreterState_Get() and _PyInterpreterState_GET(). 
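
   Tying the pieces above together (a sketch, not part of this diff): runtime
   code traverses the interpreter list under HEAD_LOCK(), for example to
   count the live interpreters; PyInterpreterState_Next() is the public
   successor accessor:

       static Py_ssize_t
       count_interpreters_example(_PyRuntimeState *runtime)
       {
           Py_ssize_t n = 0;
           HEAD_LOCK(runtime);
           for (PyInterpreterState *interp = runtime->interpreters.head;
                interp != NULL;
                interp = PyInterpreterState_Next(interp))
           {
               n++;
           }
           HEAD_UNLOCK(runtime);
           return n;
       }
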
+extern PyInterpreterState* _PyGILState_GetInterpreterStateUnsafe(void);
+
+static inline struct _Py_object_freelists* _Py_object_freelists_GET(void)
+{
+    PyThreadState *tstate = _PyThreadState_GET();
+#ifdef Py_DEBUG
+    _Py_EnsureTstateNotNULL(tstate);
+#endif
+
+#ifdef Py_GIL_DISABLED
+    return &((_PyThreadStateImpl*)tstate)->freelists;
+#else
+    return &tstate->interp->object_state.freelists;
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_PYSTATE_H */
diff --git a/Include/internal/pycore_pystats.h b/Include/internal/pycore_pystats.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8af398a56058619c35842f2a4f5c6f9a9dada97
--- /dev/null
+++ b/Include/internal/pycore_pystats.h
@@ -0,0 +1,21 @@
+#ifndef Py_INTERNAL_PYSTATS_H
+#define Py_INTERNAL_PYSTATS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#ifdef Py_STATS
+extern void _Py_StatsOn(void);
+extern void _Py_StatsOff(void);
+extern void _Py_StatsClear(void);
+extern int _Py_PrintSpecializationStats(int to_file);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif // !Py_INTERNAL_PYSTATS_H
diff --git a/Include/internal/pycore_pythonrun.h b/Include/internal/pycore_pythonrun.h
new file mode 100644
index 0000000000000000000000000000000000000000..0bfc5704dc4c5948f696a962425c676fc866010e
--- /dev/null
+++ b/Include/internal/pycore_pythonrun.h
@@ -0,0 +1,39 @@
+#ifndef Py_INTERNAL_PYTHONRUN_H
+#define Py_INTERNAL_PYTHONRUN_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+extern int _PyRun_SimpleFileObject(
+    FILE *fp,
+    PyObject *filename,
+    int closeit,
+    PyCompilerFlags *flags);
+
+extern int _PyRun_AnyFileObject(
+    FILE *fp,
+    PyObject *filename,
+    int closeit,
+    PyCompilerFlags *flags);
+
+extern int _PyRun_InteractiveLoopObject(
+    FILE *fp,
+    PyObject *filename,
+    PyCompilerFlags *flags);
+
+extern const char* _Py_SourceAsString(
+    PyObject *cmd,
+    const char *funcname,
+    const char *what,
+    PyCompilerFlags *cf,
+    PyObject **cmd_copy);
+
+#ifdef __cplusplus
+}
+#endif
+#endif // !Py_INTERNAL_PYTHONRUN_H
+
diff --git a/Include/internal/pycore_pythread.h b/Include/internal/pycore_pythread.h
new file mode 100644
index 0000000000000000000000000000000000000000..3610c6254db6af36e8dc1f7e485f7b8ad595e35d
--- /dev/null
+++ b/Include/internal/pycore_pythread.h
@@ -0,0 +1,159 @@
+#ifndef Py_INTERNAL_PYTHREAD_H
+#define Py_INTERNAL_PYTHREAD_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "dynamic_annotations.h"  // _Py_ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX
+#include "pycore_llist.h"         // struct llist_node
+
+// Get _POSIX_THREADS and _POSIX_SEMAPHORES macros if available
+#if (defined(HAVE_UNISTD_H) && !defined(_POSIX_THREADS) \
+     && !defined(_POSIX_SEMAPHORES))
+#  include <unistd.h>             // _POSIX_THREADS, _POSIX_SEMAPHORES
+#endif
+#if (defined(HAVE_PTHREAD_H) && !defined(_POSIX_THREADS) \
+     && !defined(_POSIX_SEMAPHORES))
+   // This means pthreads are not implemented in libc headers, hence the macro
+   // is not present in <unistd.h>. But they can still be implemented as an
+   // external library (e.g. gnu pth in pthread emulation)
+#  include <pthread.h>            // _POSIX_THREADS, _POSIX_SEMAPHORES
+#endif
+#if !defined(_POSIX_THREADS) && defined(__hpux) && defined(_SC_THREADS)
+   // Check if we're running on HP-UX and _SC_THREADS is defined.
+   // If so, then enough of the POSIX threads package is implemented to
+   // support Python threads.
+   //
+   // This is valid for HP-UX 11.23 running on an ia64 system. If needed, add
+   // a check of __ia64 to verify that we're running on an ia64 system instead
+   // of a pa-risc system.
+#  define _POSIX_THREADS
+#endif
+
+
+#if defined(_POSIX_THREADS) || defined(HAVE_PTHREAD_STUBS)
+#  define _USE_PTHREADS
+#endif
+
+#if defined(_USE_PTHREADS) && defined(HAVE_PTHREAD_CONDATTR_SETCLOCK) && defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_MONOTONIC)
+// monotonic is supported statically. It doesn't mean it works at runtime.
+#  define CONDATTR_MONOTONIC
+#endif
+
+
+#if defined(HAVE_PTHREAD_STUBS)
+#include "cpython/pthread_stubs.h"  // PTHREAD_KEYS_MAX
+#include <stdbool.h>                // bool
+
+// pthread_key
+struct py_stub_tls_entry {
+    bool in_use;
+    void *value;
+};
+#endif
+
+struct _pythread_runtime_state {
+    int initialized;
+
+#ifdef _USE_PTHREADS
+    // This matches when thread_pthread.h is used.
+    struct {
+        /* NULL when pthread_condattr_setclock(CLOCK_MONOTONIC) is not supported. */
+        pthread_condattr_t *ptr;
+# ifdef CONDATTR_MONOTONIC
+        /* The value to which condattr_monotonic is set. */
+        pthread_condattr_t val;
+# endif
+    } _condattr_monotonic;
+
+#endif  // _USE_PTHREADS
+
+#if defined(HAVE_PTHREAD_STUBS)
+    struct {
+        struct py_stub_tls_entry tls_entries[PTHREAD_KEYS_MAX];
+    } stubs;
+#endif
+
+    // Linked list of ThreadHandles
+    struct llist_node handles;
+};
+
+#define _pythread_RUNTIME_INIT(pythread) \
+    { \
+        .handles = LLIST_INIT(pythread.handles), \
+    }
+
+#ifdef HAVE_FORK
+/* Private function to reinitialize a lock at fork in the child process.
+   Reset the lock to the unlocked state.
+   Return 0 on success, return -1 on error. */
+extern int _PyThread_at_fork_reinit(PyThread_type_lock *lock);
+extern void _PyThread_AfterFork(struct _pythread_runtime_state *state);
+#endif  /* HAVE_FORK */
+
+
+// unset: -1 seconds, in nanoseconds
+#define PyThread_UNSET_TIMEOUT ((PyTime_t)(-1 * 1000 * 1000 * 1000))
+
+// Exported for the _interpchannels module.
+PyAPI_FUNC(int) PyThread_ParseTimeoutArg(
+    PyObject *arg,
+    int blocking,
+    PY_TIMEOUT_T *timeout);
+
+/* Helper to acquire an interruptible lock with a timeout.  If the lock acquire
+ * is interrupted, signal handlers are run, and if they raise an exception,
+ * PY_LOCK_INTR is returned.  Otherwise, PY_LOCK_ACQUIRED or PY_LOCK_FAILURE
+ * are returned, depending on whether the lock can be acquired within the
+ * timeout.
+ */
+// Exported for the _interpchannels module.
+PyAPI_FUNC(PyLockStatus) PyThread_acquire_lock_timed_with_retries(
+    PyThread_type_lock,
+    PY_TIMEOUT_T microseconds);
+
+typedef unsigned long long PyThread_ident_t;
+typedef Py_uintptr_t PyThread_handle_t;
+
+#define PY_FORMAT_THREAD_IDENT_T "llu"
+#define Py_PARSE_THREAD_IDENT_T "K"
+
+PyAPI_FUNC(PyThread_ident_t) PyThread_get_thread_ident_ex(void);
+
+/* Thread joining APIs.
+ *
+ * These APIs have a strict contract:
+ * - Either PyThread_join_thread or PyThread_detach_thread must be called
+ *   exactly once with the given handle.
+ * - Calling neither PyThread_join_thread nor PyThread_detach_thread results
+ *   in a resource leak until the end of the process.
+ * - Any other usage, such as calling both PyThread_join_thread and
+ *   PyThread_detach_thread, or calling them more than once (including
+ *   simultaneously), results in undefined behavior.
+ */
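
   A sketch of the intended usage (not part of this diff; thread_main is a
   hypothetical thread entry point), assuming the 0-on-success convention
   documented for the declarations below:

       static void thread_main(void *arg) { /* thread body */ }

       static int
       spawn_and_join_example(void)
       {
           PyThread_ident_t ident;
           PyThread_handle_t handle;
           if (PyThread_start_joinable_thread(thread_main, NULL,
                                              &ident, &handle) != 0) {
               return -1;  // could not start the thread
           }
           // Per the contract above: exactly one of join/detach, exactly once.
           return PyThread_join_thread(handle);
       }
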
+PyAPI_FUNC(int) PyThread_start_joinable_thread(void (*func)(void *),
+                                               void *arg,
+                                               PyThread_ident_t* ident,
+                                               PyThread_handle_t* handle);
+/*
+ * Join a thread started with `PyThread_start_joinable_thread`.
+ * This function cannot be interrupted. It returns 0 on success,
+ * a non-zero value on failure.
+ */
+PyAPI_FUNC(int) PyThread_join_thread(PyThread_handle_t);
+/*
+ * Detach a thread started with `PyThread_start_joinable_thread`, such
+ * that its resources are released as soon as it exits.
+ * This function cannot be interrupted. It returns 0 on success,
+ * a non-zero value on failure.
+ */
+PyAPI_FUNC(int) PyThread_detach_thread(PyThread_handle_t);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_PYTHREAD_H */
diff --git a/Include/internal/pycore_qsbr.h b/Include/internal/pycore_qsbr.h
new file mode 100644
index 0000000000000000000000000000000000000000..84e9d98dd21bda1460b0ecfd68a71c630f865ba8
--- /dev/null
+++ b/Include/internal/pycore_qsbr.h
@@ -0,0 +1,173 @@
+// The QSBR APIs (quiescent state-based reclamation) provide a mechanism for
+// the free-threaded build to safely reclaim memory when there may be
+// concurrent accesses.
+//
+// Many operations in the free-threaded build are protected by locks. However,
+// in some cases, we want to allow reads to happen concurrently with updates.
+// In this case, we need to delay freeing ("reclaiming") any memory that may be
+// concurrently accessed by a reader. The QSBR APIs provide a way to do this.
+#ifndef Py_INTERNAL_QSBR_H
+#define Py_INTERNAL_QSBR_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "pycore_lock.h"          // PyMutex
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+// The shared write sequence is always odd and incremented by two. Detached
+// threads are indicated by a read sequence of zero. This avoids collisions
+// between the offline state and any valid sequence number even if the
+// sequence numbers wrap around.
+#define QSBR_OFFLINE 0
+#define QSBR_INITIAL 1
+#define QSBR_INCR    2
+
+// Wrap-around safe comparison. This is a holdover from the FreeBSD
+// implementation, which uses 32-bit sequence numbers. We currently use 64-bit
+// sequence numbers, so wrap-around is unlikely.
+#define QSBR_LT(a, b) ((int64_t)((a)-(b)) < 0)
+#define QSBR_LEQ(a, b) ((int64_t)((a)-(b)) <= 0)
+
+struct _qsbr_shared;
+struct _PyThreadStateImpl;  // forward declare to avoid circular dependency
+
+// Per-thread state
+struct _qsbr_thread_state {
+    // Last observed write sequence (or 0 if detached)
+    uint64_t seq;
+
+    // Shared (per-interpreter) QSBR state
+    struct _qsbr_shared *shared;
+
+    // Thread state (or NULL)
+    PyThreadState *tstate;
+
+    // Number of held items added by this thread since the last write sequence
+    // advance
+    int deferred_count;
+
+    // Estimate for the amount of memory that is held by this thread since
+    // the last write sequence advance
+    size_t deferred_memory;
+
+    // Amount of memory in mimalloc pages deferred from collection. When
+    // deferred, they are prevented from being used for a different size class
+    // and in a different thread.
+    size_t deferred_page_memory;
+
+    // True if the deferred memory frees should be processed.
+    bool should_process;
+
+    // Is this thread state allocated?
+    bool allocated;
+    struct _qsbr_thread_state *freelist_next;
+};
+
+// Padding to avoid false sharing
+struct _qsbr_pad {
+    struct _qsbr_thread_state qsbr;
+    char __padding[64 - sizeof(struct _qsbr_thread_state)];
+};
+
+// Per-interpreter state
+struct _qsbr_shared {
+    // Write sequence: always odd, incremented by two
+    uint64_t wr_seq;
+
+    // Minimum observed read sequence of all QSBR thread states
+    uint64_t rd_seq;
+
+    // Array of QSBR thread states.
+    struct _qsbr_pad *array;
+    Py_ssize_t size;
+
+    // Freelist of unused _qsbr_thread_states (protected by mutex)
+    PyMutex mutex;
+    struct _qsbr_thread_state *freelist;
+};
+
+static inline uint64_t
+_Py_qsbr_shared_current(struct _qsbr_shared *shared)
+{
+    return _Py_atomic_load_uint64_acquire(&shared->wr_seq);
+}
+
+// Reports a quiescent state: the caller no longer holds any pointer to shared
+// data not protected by locks or reference counts.
+static inline void
+_Py_qsbr_quiescent_state(struct _qsbr_thread_state *qsbr)
+{
+    uint64_t seq = _Py_qsbr_shared_current(qsbr->shared);
+    _Py_atomic_store_uint64_release(&qsbr->seq, seq);
+}
+
+// Have the read sequences advanced to the given goal? Like `_Py_qsbr_poll()`,
+// but does not perform a scan of threads.
+static inline bool
+_Py_qbsr_goal_reached(struct _qsbr_thread_state *qsbr, uint64_t goal)
+{
+    uint64_t rd_seq = _Py_atomic_load_uint64(&qsbr->shared->rd_seq);
+    return QSBR_LEQ(goal, rd_seq);
+}
+
+// Advance the write sequence and return the new goal. This should be called
+// after data is removed. The returned goal is used with `_Py_qsbr_poll()` to
+// determine when it is safe to reclaim (free) the memory.
+extern uint64_t
+_Py_qsbr_advance(struct _qsbr_shared *shared);
+
+// Return the next value for the write sequence (current plus the increment).
+extern uint64_t
+_Py_qsbr_shared_next(struct _qsbr_shared *shared);
+
+// Return true if deferred memory frees held by QSBR should be processed to
+// determine if they can be safely freed.
+static inline bool
+_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr)
+{
+    return qsbr->should_process;
+}
+
+// Have the read sequences advanced to the given goal? If this returns true,
+// it is safe to reclaim any memory tagged with the goal (or an earlier goal).
+extern bool
+_Py_qsbr_poll(struct _qsbr_thread_state *qsbr, uint64_t goal);
+
+// Called when a thread attaches to the interpreter.
+extern void
+_Py_qsbr_attach(struct _qsbr_thread_state *qsbr);
+
+// Called when a thread detaches from the interpreter.
+extern void
+_Py_qsbr_detach(struct _qsbr_thread_state *qsbr);
+
+// Reserves (allocates) a QSBR state and returns its index.
+extern Py_ssize_t
+_Py_qsbr_reserve(PyInterpreterState *interp);
+
+// Associates a PyThreadState with the QSBR state at the given index.
+extern void
+_Py_qsbr_register(struct _PyThreadStateImpl *tstate,
+                  PyInterpreterState *interp, Py_ssize_t index);
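
   Putting the API together (a sketch, not part of this diff; the retired_*
   names are hypothetical): a writer tags retired memory with a goal from
   _Py_qsbr_advance() and frees it only once _Py_qsbr_poll() confirms that
   every reader has passed a quiescent state since then:

       struct retired_example {
           void *ptr;
           uint64_t goal;
       };

       static void
       retire_example(struct _qsbr_thread_state *qsbr,
                      struct retired_example *r, void *ptr)
       {
           r->ptr = ptr;
           // Readers must catch up to this goal before ptr may be freed.
           r->goal = _Py_qsbr_advance(qsbr->shared);
       }

       static void
       maybe_reclaim_example(struct _qsbr_thread_state *qsbr,
                             struct retired_example *r)
       {
           if (r->ptr != NULL && _Py_qsbr_poll(qsbr, r->goal)) {
               PyMem_RawFree(r->ptr);  // no reader can still observe it
               r->ptr = NULL;
           }
       }
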
+// Disassociates a PyThreadState from the QSBR state and frees the QSBR state.
+extern void
+_Py_qsbr_unregister(PyThreadState *tstate);
+
+extern void
+_Py_qsbr_fini(PyInterpreterState *interp);
+
+extern void
+_Py_qsbr_after_fork(struct _PyThreadStateImpl *tstate);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_QSBR_H */
diff --git a/Include/internal/pycore_range.h b/Include/internal/pycore_range.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf045ec4fd8332de7d13c72558099f5db8bee086
--- /dev/null
+++ b/Include/internal/pycore_range.h
@@ -0,0 +1,21 @@
+#ifndef Py_INTERNAL_RANGE_H
+#define Py_INTERNAL_RANGE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+typedef struct {
+    PyObject_HEAD
+    long start;
+    long step;
+    long len;
+} _PyRangeIterObject;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_RANGE_H */
diff --git a/Include/internal/pycore_runtime.h b/Include/internal/pycore_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed028944d18e046317cfa3a62239c9e2e4c32c59
--- /dev/null
+++ b/Include/internal/pycore_runtime.h
@@ -0,0 +1,410 @@
+#ifndef Py_INTERNAL_RUNTIME_H
+#define Py_INTERNAL_RUNTIME_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_atexit.h"        // struct _atexit_runtime_state
+#include "pycore_ceval_state.h"   // struct _ceval_runtime_state
+#include "pycore_crossinterp.h"   // struct _xidregistry
+#include "pycore_faulthandler.h"  // struct _faulthandler_runtime_state
+#include "pycore_floatobject.h"   // struct _Py_float_runtime_state
+#include "pycore_import.h"        // struct _import_runtime_state
+#include "pycore_interp.h"        // PyInterpreterState
+#include "pycore_object_state.h"  // struct _py_object_runtime_state
+#include "pycore_parser.h"        // struct _parser_runtime_state
+#include "pycore_pyhash.h"        // struct pyhash_runtime_state
+#include "pycore_pymem.h"         // struct _pymem_allocators
+#include "pycore_pythread.h"      // struct _pythread_runtime_state
+#include "pycore_signal.h"        // struct _signals_runtime_state
+#include "pycore_tracemalloc.h"   // struct _tracemalloc_runtime_state
+#include "pycore_typeobject.h"    // struct _types_runtime_state
+#include "pycore_unicodeobject.h" // struct _Py_unicode_runtime_state
+
+struct _getargs_runtime_state {
+    struct _PyArg_Parser *static_parsers;
+};
+
+/* GIL state */
+
+struct _gilstate_runtime_state {
+    /* bpo-26558: Flag to disable PyGILState_Check().
+       If set to non-zero, PyGILState_Check() always returns 1.
*/ + int check_enabled; + /* The single PyInterpreterState used by this process' + GILState implementation + */ + /* TODO: Given interp_main, it may be possible to kill this ref */ + PyInterpreterState *autoInterpreterState; +}; + +/* Runtime audit hook state */ + +#define _Py_Debug_Cookie "xdebugpy" + +#ifdef Py_GIL_DISABLED +# define _Py_Debug_gilruntimestate_enabled offsetof(struct _gil_runtime_state, enabled) +# define _Py_Debug_Free_Threaded 1 +#else +# define _Py_Debug_gilruntimestate_enabled 0 +# define _Py_Debug_Free_Threaded 0 +#endif +typedef struct _Py_AuditHookEntry { + struct _Py_AuditHookEntry *next; + Py_AuditHookFunction hookCFunction; + void *userData; +} _Py_AuditHookEntry; + +typedef struct _Py_DebugOffsets { + char cookie[8] _Py_NONSTRING; + uint64_t version; + uint64_t free_threaded; + // Runtime state offset; + struct _runtime_state { + uint64_t size; + uint64_t finalizing; + uint64_t interpreters_head; + } runtime_state; + + // Interpreter state offset; + struct _interpreter_state { + uint64_t size; + uint64_t id; + uint64_t next; + uint64_t threads_head; + uint64_t gc; + uint64_t imports_modules; + uint64_t sysdict; + uint64_t builtins; + uint64_t ceval_gil; + uint64_t gil_runtime_state; + uint64_t gil_runtime_state_enabled; + uint64_t gil_runtime_state_locked; + uint64_t gil_runtime_state_holder; + } interpreter_state; + + // Thread state offset; + struct _thread_state{ + uint64_t size; + uint64_t prev; + uint64_t next; + uint64_t interp; + uint64_t current_frame; + uint64_t thread_id; + uint64_t native_thread_id; + uint64_t datastack_chunk; + uint64_t status; + } thread_state; + + // InterpreterFrame offset; + struct _interpreter_frame { + uint64_t size; + uint64_t previous; + uint64_t executable; + uint64_t instr_ptr; + uint64_t localsplus; + uint64_t owner; + } interpreter_frame; + + // Code object offset; + struct _code_object { + uint64_t size; + uint64_t filename; + uint64_t name; + uint64_t qualname; + uint64_t linetable; + uint64_t firstlineno; + uint64_t argcount; + uint64_t localsplusnames; + uint64_t localspluskinds; + uint64_t co_code_adaptive; + } code_object; + + // PyObject offset; + struct _pyobject { + uint64_t size; + uint64_t ob_type; + } pyobject; + + // PyTypeObject object offset; + struct _type_object { + uint64_t size; + uint64_t tp_name; + uint64_t tp_repr; + uint64_t tp_flags; + } type_object; + + // PyTuple object offset; + struct _tuple_object { + uint64_t size; + uint64_t ob_item; + uint64_t ob_size; + } tuple_object; + + // PyList object offset; + struct _list_object { + uint64_t size; + uint64_t ob_item; + uint64_t ob_size; + } list_object; + + // PyDict object offset; + struct _dict_object { + uint64_t size; + uint64_t ma_keys; + uint64_t ma_values; + } dict_object; + + // PyFloat object offset; + struct _float_object { + uint64_t size; + uint64_t ob_fval; + } float_object; + + // PyLong object offset; + struct _long_object { + uint64_t size; + uint64_t lv_tag; + uint64_t ob_digit; + } long_object; + + // PyBytes object offset; + struct _bytes_object { + uint64_t size; + uint64_t ob_size; + uint64_t ob_sval; + } bytes_object; + + // Unicode object offset; + struct _unicode_object { + uint64_t size; + uint64_t state; + uint64_t length; + uint64_t asciiobject_size; + } unicode_object; + + // GC runtime state offset; + struct _gc { + uint64_t size; + uint64_t collecting; + } gc; +} _Py_DebugOffsets; + +/* Reference tracer state */ +struct _reftracer_runtime_state { + PyRefTracer tracer_func; + void* tracer_data; +}; + +/* Full Python 
runtime state */ + +/* _PyRuntimeState holds the global state for the CPython runtime. + That data is exposed in the internal API as a static variable (_PyRuntime). + */ +typedef struct pyruntimestate { + /* This field must be first to facilitate locating it by out of process + * debuggers. Out of process debuggers will use the offsets contained in this + * field to be able to locate other fields in several interpreter structures + * in a way that doesn't require them to know the exact layout of those + * structures. + * + * IMPORTANT: + * This struct is **NOT** backwards compatible between minor version of the + * interpreter and the members, order of members and size can change between + * minor versions. This struct is only guaranteed to be stable between patch + * versions for a given minor version of the interpreter. + */ + _Py_DebugOffsets debug_offsets; + + /* Has been initialized to a safe state. + + In order to be effective, this must be set to 0 during or right + after allocation. */ + int _initialized; + + /* Is running Py_PreInitialize()? */ + int preinitializing; + + /* Is Python preinitialized? Set to 1 by Py_PreInitialize() */ + int preinitialized; + + /* Is Python core initialized? Set to 1 by _Py_InitializeCore() */ + int core_initialized; + + /* Is Python fully initialized? Set to 1 by Py_Initialize() */ + int initialized; + + /* Set by Py_FinalizeEx(). Only reset to NULL if Py_Initialize() + is called again. + + Use _PyRuntimeState_GetFinalizing() and _PyRuntimeState_SetFinalizing() + to access it, don't access it directly. */ + PyThreadState *_finalizing; + /* The ID of the OS thread in which we are finalizing. */ + unsigned long _finalizing_id; + + struct pyinterpreters { + PyMutex mutex; + /* The linked list of interpreters, newest first. */ + PyInterpreterState *head; + /* The runtime's initial interpreter, which has a special role + in the operation of the runtime. It is also often the only + interpreter. */ + PyInterpreterState *main; + /* next_id is an auto-numbered sequence of small + integers. It gets initialized in _PyInterpreterState_Enable(), + which is called in Py_Initialize(), and used in + PyInterpreterState_New(). A negative interpreter ID + indicates an error occurred. The main interpreter will + always have an ID of 0. Overflow results in a RuntimeError. + If that becomes a problem later then we can adjust, e.g. by + using a Python int. */ + int64_t next_id; + } interpreters; + + /* Platform-specific identifier and PyThreadState, respectively, for the + main thread in the main interpreter. */ + unsigned long main_thread; + PyThreadState *main_tstate; + + /* ---------- IMPORTANT --------------------------- + The fields above this line are declared as early as + possible to facilitate out-of-process observability + tools. */ + + /* cross-interpreter data and utils */ + struct _xi_runtime_state xi; + + struct _pymem_allocators allocators; + struct _obmalloc_global_state obmalloc; + struct pyhash_runtime_state pyhash_state; + struct _pythread_runtime_state threads; + struct _signals_runtime_state signals; + + /* Used for the thread state bound to the current thread. */ + Py_tss_t autoTSSkey; + + /* Used instead of PyThreadState.trash when there is not current tstate. 
*/ + Py_tss_t trashTSSkey; + + PyWideStringList orig_argv; + + struct _parser_runtime_state parser; + + struct _atexit_runtime_state atexit; + + struct _import_runtime_state imports; + struct _ceval_runtime_state ceval; + struct _gilstate_runtime_state gilstate; + struct _getargs_runtime_state getargs; + struct _fileutils_state fileutils; + struct _faulthandler_runtime_state faulthandler; + struct _tracemalloc_runtime_state tracemalloc; + struct _reftracer_runtime_state ref_tracer; + + // The rwmutex is used to prevent overlapping global and per-interpreter + // stop-the-world events. Global stop-the-world events lock the mutex + // exclusively (as a "writer"), while per-interpreter stop-the-world events + // lock it non-exclusively (as "readers"). + _PyRWMutex stoptheworld_mutex; + struct _stoptheworld_state stoptheworld; + + PyPreConfig preconfig; + + // Audit values must be preserved when Py_Initialize()/Py_Finalize() + // is called multiple times. + Py_OpenCodeHookFunction open_code_hook; + void *open_code_userdata; + struct { + PyMutex mutex; + _Py_AuditHookEntry *head; + } audit_hooks; + + struct _py_object_runtime_state object_state; + struct _Py_float_runtime_state float_state; + struct _Py_unicode_runtime_state unicode_state; + struct _types_runtime_state types; + + /* All the objects that are shared by the runtime's interpreters. */ + struct _Py_cached_objects cached_objects; + struct _Py_static_objects static_objects; + + /* The following fields are here to avoid allocation during init. + The data is exposed through _PyRuntimeState pointer fields. + These fields should not be accessed directly outside of init. + + All other _PyRuntimeState pointer fields are populated when + needed and default to NULL. + + For now there are some exceptions to that rule, which require + allocation during init. These will be addressed on a case-by-case + basis. Most notably, we don't pre-allocated the several mutex + (PyThread_type_lock) fields, because on Windows we only ever get + a pointer type. + */ + + /* _PyRuntimeState.interpreters.main */ + PyInterpreterState _main_interpreter; + +#if defined(__EMSCRIPTEN__) && defined(PY_CALL_TRAMPOLINE) + // Used in "Python/emscripten_trampoline.c" to choose between type + // reflection trampoline and EM_JS trampoline. + bool wasm_type_reflection_available; +#endif + +} _PyRuntimeState; + + +/* other API */ + +// Export _PyRuntime for shared extensions which use it in static inline +// functions for best performance, like _Py_IsMainThread() or _Py_ID(). +// It's also made accessible for debuggers and profilers. +PyAPI_DATA(_PyRuntimeState) _PyRuntime; + +extern PyStatus _PyRuntimeState_Init(_PyRuntimeState *runtime); +extern void _PyRuntimeState_Fini(_PyRuntimeState *runtime); + +#ifdef HAVE_FORK +extern PyStatus _PyRuntimeState_ReInitThreads(_PyRuntimeState *runtime); +#endif + +/* Initialize _PyRuntimeState. + Return NULL on success, or return an error message on failure. 
*/ +extern PyStatus _PyRuntime_Initialize(void); + +extern void _PyRuntime_Finalize(void); + + +static inline PyThreadState* +_PyRuntimeState_GetFinalizing(_PyRuntimeState *runtime) { + return (PyThreadState*)_Py_atomic_load_ptr_relaxed(&runtime->_finalizing); +} + +static inline unsigned long +_PyRuntimeState_GetFinalizingID(_PyRuntimeState *runtime) { + return _Py_atomic_load_ulong_relaxed(&runtime->_finalizing_id); +} + +static inline void +_PyRuntimeState_SetFinalizing(_PyRuntimeState *runtime, PyThreadState *tstate) { + _Py_atomic_store_ptr_relaxed(&runtime->_finalizing, tstate); + if (tstate == NULL) { + _Py_atomic_store_ulong_relaxed(&runtime->_finalizing_id, 0); + } + else { + // XXX Re-enable this assert once gh-109860 is fixed. + //assert(tstate->thread_id == PyThread_get_thread_ident()); + _Py_atomic_store_ulong_relaxed(&runtime->_finalizing_id, + tstate->thread_id); + } +} + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_RUNTIME_H */ diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h new file mode 100644 index 0000000000000000000000000000000000000000..7eef9edc0aa33dd5d9c8eb67cff24907e4e98dc7 --- /dev/null +++ b/Include/internal/pycore_runtime_init.h @@ -0,0 +1,329 @@ +#ifndef Py_INTERNAL_RUNTIME_INIT_H +#define Py_INTERNAL_RUNTIME_INIT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_ceval_state.h" // _PyEval_RUNTIME_PERF_INIT +#include "pycore_faulthandler.h" // _faulthandler_runtime_state_INIT +#include "pycore_floatobject.h" // _py_float_format_unknown +#include "pycore_object.h" // _PyObject_HEAD_INIT +#include "pycore_obmalloc_init.h" // _obmalloc_global_state_INIT +#include "pycore_parser.h" // _parser_runtime_state_INIT +#include "pycore_pyhash.h" // pyhash_state_INIT +#include "pycore_pymem_init.h" // _pymem_allocators_standard_INIT +#include "pycore_pythread.h" // _pythread_RUNTIME_INIT +#include "pycore_qsbr.h" // QSBR_INITIAL +#include "pycore_runtime_init_generated.h" // _Py_bytes_characters_INIT +#include "pycore_signal.h" // _signals_RUNTIME_INIT +#include "pycore_tracemalloc.h" // _tracemalloc_runtime_state_INIT + + +extern PyTypeObject _PyExc_MemoryError; + + +/* The static initializers defined here should only be used + in the runtime init code (in pystate.c and pylifecycle.c). 
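
   For reference, pystate.c instantiates the global runtime with the macro
   below, roughly as follows (a sketch of the call shape, not a verbatim
   quote):

       _PyRuntimeState _PyRuntime = _PyRuntimeState_INIT(_PyRuntime, _Py_Debug_Cookie);
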
*/ + +#define _PyRuntimeState_INIT(runtime, debug_cookie) \ + { \ + .debug_offsets = { \ + .cookie = debug_cookie, \ + .version = PY_VERSION_HEX, \ + .free_threaded = _Py_Debug_Free_Threaded, \ + .runtime_state = { \ + .size = sizeof(_PyRuntimeState), \ + .finalizing = offsetof(_PyRuntimeState, _finalizing), \ + .interpreters_head = offsetof(_PyRuntimeState, interpreters.head), \ + }, \ + .interpreter_state = { \ + .size = sizeof(PyInterpreterState), \ + .id = offsetof(PyInterpreterState, id), \ + .next = offsetof(PyInterpreterState, next), \ + .threads_head = offsetof(PyInterpreterState, threads.head), \ + .gc = offsetof(PyInterpreterState, gc), \ + .imports_modules = offsetof(PyInterpreterState, imports.modules), \ + .sysdict = offsetof(PyInterpreterState, sysdict), \ + .builtins = offsetof(PyInterpreterState, builtins), \ + .ceval_gil = offsetof(PyInterpreterState, ceval.gil), \ + .gil_runtime_state = offsetof(PyInterpreterState, _gil), \ + .gil_runtime_state_enabled = _Py_Debug_gilruntimestate_enabled, \ + .gil_runtime_state_locked = offsetof(PyInterpreterState, _gil.locked), \ + .gil_runtime_state_holder = offsetof(PyInterpreterState, _gil.last_holder), \ + }, \ + .thread_state = { \ + .size = sizeof(PyThreadState), \ + .prev = offsetof(PyThreadState, prev), \ + .next = offsetof(PyThreadState, next), \ + .interp = offsetof(PyThreadState, interp), \ + .current_frame = offsetof(PyThreadState, current_frame), \ + .thread_id = offsetof(PyThreadState, thread_id), \ + .native_thread_id = offsetof(PyThreadState, native_thread_id), \ + .datastack_chunk = offsetof(PyThreadState, datastack_chunk), \ + .status = offsetof(PyThreadState, _status), \ + }, \ + .interpreter_frame = { \ + .size = sizeof(_PyInterpreterFrame), \ + .previous = offsetof(_PyInterpreterFrame, previous), \ + .executable = offsetof(_PyInterpreterFrame, f_executable), \ + .instr_ptr = offsetof(_PyInterpreterFrame, instr_ptr), \ + .localsplus = offsetof(_PyInterpreterFrame, localsplus), \ + .owner = offsetof(_PyInterpreterFrame, owner), \ + }, \ + .code_object = { \ + .size = sizeof(PyCodeObject), \ + .filename = offsetof(PyCodeObject, co_filename), \ + .name = offsetof(PyCodeObject, co_name), \ + .qualname = offsetof(PyCodeObject, co_qualname), \ + .linetable = offsetof(PyCodeObject, co_linetable), \ + .firstlineno = offsetof(PyCodeObject, co_firstlineno), \ + .argcount = offsetof(PyCodeObject, co_argcount), \ + .localsplusnames = offsetof(PyCodeObject, co_localsplusnames), \ + .localspluskinds = offsetof(PyCodeObject, co_localspluskinds), \ + .co_code_adaptive = offsetof(PyCodeObject, co_code_adaptive), \ + }, \ + .pyobject = { \ + .size = sizeof(PyObject), \ + .ob_type = offsetof(PyObject, ob_type), \ + }, \ + .type_object = { \ + .size = sizeof(PyTypeObject), \ + .tp_name = offsetof(PyTypeObject, tp_name), \ + .tp_repr = offsetof(PyTypeObject, tp_repr), \ + .tp_flags = offsetof(PyTypeObject, tp_flags), \ + }, \ + .tuple_object = { \ + .size = sizeof(PyTupleObject), \ + .ob_item = offsetof(PyTupleObject, ob_item), \ + .ob_size = offsetof(PyTupleObject, ob_base.ob_size), \ + }, \ + .list_object = { \ + .size = sizeof(PyListObject), \ + .ob_item = offsetof(PyListObject, ob_item), \ + .ob_size = offsetof(PyListObject, ob_base.ob_size), \ + }, \ + .dict_object = { \ + .size = sizeof(PyDictObject), \ + .ma_keys = offsetof(PyDictObject, ma_keys), \ + .ma_values = offsetof(PyDictObject, ma_values), \ + }, \ + .float_object = { \ + .size = sizeof(PyFloatObject), \ + .ob_fval = offsetof(PyFloatObject, ob_fval), \ + }, \ + 
.long_object = { \ + .size = sizeof(PyLongObject), \ + .lv_tag = offsetof(PyLongObject, long_value.lv_tag), \ + .ob_digit = offsetof(PyLongObject, long_value.ob_digit), \ + }, \ + .bytes_object = { \ + .size = sizeof(PyBytesObject), \ + .ob_size = offsetof(PyBytesObject, ob_base.ob_size), \ + .ob_sval = offsetof(PyBytesObject, ob_sval), \ + }, \ + .unicode_object = { \ + .size = sizeof(PyUnicodeObject), \ + .state = offsetof(PyUnicodeObject, _base._base.state), \ + .length = offsetof(PyUnicodeObject, _base._base.length), \ + .asciiobject_size = sizeof(PyASCIIObject), \ + }, \ + .gc = { \ + .size = sizeof(struct _gc_runtime_state), \ + .collecting = offsetof(struct _gc_runtime_state, collecting), \ + }, \ + }, \ + .allocators = { \ + .standard = _pymem_allocators_standard_INIT(runtime), \ + .debug = _pymem_allocators_debug_INIT, \ + .obj_arena = _pymem_allocators_obj_arena_INIT, \ + .is_debug_enabled = _pymem_is_debug_enabled_INIT, \ + }, \ + .obmalloc = _obmalloc_global_state_INIT, \ + .pyhash_state = pyhash_state_INIT, \ + .threads = _pythread_RUNTIME_INIT(runtime.threads), \ + .signals = _signals_RUNTIME_INIT, \ + .interpreters = { \ + /* This prevents interpreters from getting created \ + until _PyInterpreterState_Enable() is called. */ \ + .next_id = -1, \ + }, \ + .xi = { \ + .registry = { \ + .global = 1, \ + }, \ + }, \ + /* A TSS key must be initialized with Py_tss_NEEDS_INIT \ + in accordance with the specification. */ \ + .autoTSSkey = Py_tss_NEEDS_INIT, \ + .parser = _parser_runtime_state_INIT, \ + .ceval = { \ + .pending_mainthread = { \ + .max = MAXPENDINGCALLS_MAIN, \ + .maxloop = MAXPENDINGCALLSLOOP_MAIN, \ + }, \ + .perf = _PyEval_RUNTIME_PERF_INIT, \ + }, \ + .gilstate = { \ + .check_enabled = 1, \ + }, \ + .fileutils = { \ + .force_ascii = -1, \ + }, \ + .faulthandler = _faulthandler_runtime_state_INIT, \ + .tracemalloc = _tracemalloc_runtime_state_INIT, \ + .ref_tracer = { \ + .tracer_func = NULL, \ + .tracer_data = NULL, \ + }, \ + .stoptheworld = { \ + .is_global = 1, \ + }, \ + .float_state = { \ + .float_format = _py_float_format_unknown, \ + .double_format = _py_float_format_unknown, \ + }, \ + .types = { \ + .next_version_tag = 1, \ + }, \ + .static_objects = { \ + .singletons = { \ + .small_ints = _Py_small_ints_INIT, \ + .bytes_empty = _PyBytes_SIMPLE_INIT(0, 0), \ + .bytes_characters = _Py_bytes_characters_INIT, \ + .strings = { \ + .literals = _Py_str_literals_INIT, \ + .identifiers = _Py_str_identifiers_INIT, \ + .ascii = _Py_str_ascii_INIT, \ + .latin1 = _Py_str_latin1_INIT, \ + }, \ + .tuple_empty = { \ + .ob_base = _PyVarObject_HEAD_INIT(&PyTuple_Type, 0), \ + }, \ + .hamt_bitmap_node_empty = { \ + .ob_base = _PyVarObject_HEAD_INIT(&_PyHamt_BitmapNode_Type, 0), \ + }, \ + .context_token_missing = { \ + .ob_base = _PyObject_HEAD_INIT(&_PyContextTokenMissing_Type), \ + }, \ + }, \ + }, \ + ._main_interpreter = _PyInterpreterState_INIT(runtime._main_interpreter), \ + } + +#define _PyInterpreterState_INIT(INTERP) \ + { \ + .id_refcount = -1, \ + ._whence = _PyInterpreterState_WHENCE_NOTSET, \ + .imports = IMPORTS_INIT, \ + .ceval = { \ + .recursion_limit = Py_DEFAULT_RECURSION_LIMIT, \ + .pending = { \ + .max = MAXPENDINGCALLS, \ + .maxloop = MAXPENDINGCALLSLOOP, \ + }, \ + }, \ + .gc = { \ + .enabled = 1, \ + .generations = { \ + /* .head is set in _PyGC_InitState(). 
*/ \ + { .threshold = 2000, }, \ + { .threshold = 10, }, \ + { .threshold = 10, }, \ + }, \ + }, \ + .qsbr = { \ + .wr_seq = QSBR_INITIAL, \ + .rd_seq = QSBR_INITIAL, \ + }, \ + .dtoa = _dtoa_state_INIT(&(INTERP)), \ + .dict_state = _dict_state_INIT, \ + .mem_free_queue = _Py_mem_free_queue_INIT(INTERP.mem_free_queue), \ + .func_state = { \ + .next_version = 1, \ + }, \ + .types = { \ + .next_version_tag = _Py_TYPE_BASE_VERSION_TAG, \ + }, \ + .static_objects = { \ + .singletons = { \ + ._not_used = 1, \ + .hamt_empty = { \ + .ob_base = _PyObject_HEAD_INIT(&_PyHamt_Type), \ + .h_root = (PyHamtNode*)&_Py_SINGLETON(hamt_bitmap_node_empty), \ + }, \ + .last_resort_memory_error = { \ + _PyObject_HEAD_INIT(&_PyExc_MemoryError), \ + .args = (PyObject*)&_Py_SINGLETON(tuple_empty) \ + }, \ + }, \ + }, \ + ._initial_thread = _PyThreadStateImpl_INIT, \ + } + +#define _PyThreadStateImpl_INIT \ + { \ + .base = _PyThreadState_INIT, \ + } + +#define _PyThreadState_INIT \ + { \ + ._whence = _PyThreadState_WHENCE_NOTSET, \ + .py_recursion_limit = Py_DEFAULT_RECURSION_LIMIT, \ + .context_ver = 1, \ + } + + +// global objects + +#define _PyBytes_SIMPLE_INIT(CH, LEN) \ + { \ + _PyVarObject_HEAD_INIT(&PyBytes_Type, (LEN)), \ + .ob_shash = -1, \ + .ob_sval = { (CH) }, \ + } +#define _PyBytes_CHAR_INIT(CH) \ + { \ + _PyBytes_SIMPLE_INIT((CH), 1) \ + } + +#define _PyUnicode_ASCII_BASE_INIT(LITERAL, ASCII) \ + { \ + .ob_base = _PyObject_HEAD_INIT(&PyUnicode_Type), \ + .length = sizeof(LITERAL) - 1, \ + .hash = -1, \ + .state = { \ + .kind = 1, \ + .compact = 1, \ + .ascii = (ASCII), \ + .statically_allocated = 1, \ + }, \ + } +#define _PyASCIIObject_INIT(LITERAL) \ + { \ + ._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1), \ + ._data = (LITERAL) \ + } +#define INIT_STR(NAME, LITERAL) \ + ._py_ ## NAME = _PyASCIIObject_INIT(LITERAL) +#define INIT_ID(NAME) \ + ._py_ ## NAME = _PyASCIIObject_INIT(#NAME) +#define _PyUnicode_LATIN1_INIT(LITERAL, UTF8) \ + { \ + ._latin1 = { \ + ._base = _PyUnicode_ASCII_BASE_INIT((LITERAL), 0), \ + .utf8 = (UTF8), \ + .utf8_length = sizeof(UTF8) - 1, \ + }, \ + ._data = (LITERAL), \ + } + +#include "pycore_runtime_init_generated.h" + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_RUNTIME_INIT_H */ diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h new file mode 100644 index 0000000000000000000000000000000000000000..19a6b9b1537d514d69969254ca19c6037eb461be --- /dev/null +++ b/Include/internal/pycore_runtime_init_generated.h @@ -0,0 +1,1551 @@ +#ifndef Py_INTERNAL_RUNTIME_INIT_GENERATED_H +#define Py_INTERNAL_RUNTIME_INIT_GENERATED_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_long.h" // _PyLong_DIGIT_INIT() + + +/* The following is auto-generated by Tools/build/generate_global_objects.py. 
*/ +#define _Py_small_ints_INIT { \ + _PyLong_DIGIT_INIT(-5), \ + _PyLong_DIGIT_INIT(-4), \ + _PyLong_DIGIT_INIT(-3), \ + _PyLong_DIGIT_INIT(-2), \ + _PyLong_DIGIT_INIT(-1), \ + _PyLong_DIGIT_INIT(0), \ + _PyLong_DIGIT_INIT(1), \ + _PyLong_DIGIT_INIT(2), \ + _PyLong_DIGIT_INIT(3), \ + _PyLong_DIGIT_INIT(4), \ + _PyLong_DIGIT_INIT(5), \ + _PyLong_DIGIT_INIT(6), \ + _PyLong_DIGIT_INIT(7), \ + _PyLong_DIGIT_INIT(8), \ + _PyLong_DIGIT_INIT(9), \ + _PyLong_DIGIT_INIT(10), \ + _PyLong_DIGIT_INIT(11), \ + _PyLong_DIGIT_INIT(12), \ + _PyLong_DIGIT_INIT(13), \ + _PyLong_DIGIT_INIT(14), \ + _PyLong_DIGIT_INIT(15), \ + _PyLong_DIGIT_INIT(16), \ + _PyLong_DIGIT_INIT(17), \ + _PyLong_DIGIT_INIT(18), \ + _PyLong_DIGIT_INIT(19), \ + _PyLong_DIGIT_INIT(20), \ + _PyLong_DIGIT_INIT(21), \ + _PyLong_DIGIT_INIT(22), \ + _PyLong_DIGIT_INIT(23), \ + _PyLong_DIGIT_INIT(24), \ + _PyLong_DIGIT_INIT(25), \ + _PyLong_DIGIT_INIT(26), \ + _PyLong_DIGIT_INIT(27), \ + _PyLong_DIGIT_INIT(28), \ + _PyLong_DIGIT_INIT(29), \ + _PyLong_DIGIT_INIT(30), \ + _PyLong_DIGIT_INIT(31), \ + _PyLong_DIGIT_INIT(32), \ + _PyLong_DIGIT_INIT(33), \ + _PyLong_DIGIT_INIT(34), \ + _PyLong_DIGIT_INIT(35), \ + _PyLong_DIGIT_INIT(36), \ + _PyLong_DIGIT_INIT(37), \ + _PyLong_DIGIT_INIT(38), \ + _PyLong_DIGIT_INIT(39), \ + _PyLong_DIGIT_INIT(40), \ + _PyLong_DIGIT_INIT(41), \ + _PyLong_DIGIT_INIT(42), \ + _PyLong_DIGIT_INIT(43), \ + _PyLong_DIGIT_INIT(44), \ + _PyLong_DIGIT_INIT(45), \ + _PyLong_DIGIT_INIT(46), \ + _PyLong_DIGIT_INIT(47), \ + _PyLong_DIGIT_INIT(48), \ + _PyLong_DIGIT_INIT(49), \ + _PyLong_DIGIT_INIT(50), \ + _PyLong_DIGIT_INIT(51), \ + _PyLong_DIGIT_INIT(52), \ + _PyLong_DIGIT_INIT(53), \ + _PyLong_DIGIT_INIT(54), \ + _PyLong_DIGIT_INIT(55), \ + _PyLong_DIGIT_INIT(56), \ + _PyLong_DIGIT_INIT(57), \ + _PyLong_DIGIT_INIT(58), \ + _PyLong_DIGIT_INIT(59), \ + _PyLong_DIGIT_INIT(60), \ + _PyLong_DIGIT_INIT(61), \ + _PyLong_DIGIT_INIT(62), \ + _PyLong_DIGIT_INIT(63), \ + _PyLong_DIGIT_INIT(64), \ + _PyLong_DIGIT_INIT(65), \ + _PyLong_DIGIT_INIT(66), \ + _PyLong_DIGIT_INIT(67), \ + _PyLong_DIGIT_INIT(68), \ + _PyLong_DIGIT_INIT(69), \ + _PyLong_DIGIT_INIT(70), \ + _PyLong_DIGIT_INIT(71), \ + _PyLong_DIGIT_INIT(72), \ + _PyLong_DIGIT_INIT(73), \ + _PyLong_DIGIT_INIT(74), \ + _PyLong_DIGIT_INIT(75), \ + _PyLong_DIGIT_INIT(76), \ + _PyLong_DIGIT_INIT(77), \ + _PyLong_DIGIT_INIT(78), \ + _PyLong_DIGIT_INIT(79), \ + _PyLong_DIGIT_INIT(80), \ + _PyLong_DIGIT_INIT(81), \ + _PyLong_DIGIT_INIT(82), \ + _PyLong_DIGIT_INIT(83), \ + _PyLong_DIGIT_INIT(84), \ + _PyLong_DIGIT_INIT(85), \ + _PyLong_DIGIT_INIT(86), \ + _PyLong_DIGIT_INIT(87), \ + _PyLong_DIGIT_INIT(88), \ + _PyLong_DIGIT_INIT(89), \ + _PyLong_DIGIT_INIT(90), \ + _PyLong_DIGIT_INIT(91), \ + _PyLong_DIGIT_INIT(92), \ + _PyLong_DIGIT_INIT(93), \ + _PyLong_DIGIT_INIT(94), \ + _PyLong_DIGIT_INIT(95), \ + _PyLong_DIGIT_INIT(96), \ + _PyLong_DIGIT_INIT(97), \ + _PyLong_DIGIT_INIT(98), \ + _PyLong_DIGIT_INIT(99), \ + _PyLong_DIGIT_INIT(100), \ + _PyLong_DIGIT_INIT(101), \ + _PyLong_DIGIT_INIT(102), \ + _PyLong_DIGIT_INIT(103), \ + _PyLong_DIGIT_INIT(104), \ + _PyLong_DIGIT_INIT(105), \ + _PyLong_DIGIT_INIT(106), \ + _PyLong_DIGIT_INIT(107), \ + _PyLong_DIGIT_INIT(108), \ + _PyLong_DIGIT_INIT(109), \ + _PyLong_DIGIT_INIT(110), \ + _PyLong_DIGIT_INIT(111), \ + _PyLong_DIGIT_INIT(112), \ + _PyLong_DIGIT_INIT(113), \ + _PyLong_DIGIT_INIT(114), \ + _PyLong_DIGIT_INIT(115), \ + _PyLong_DIGIT_INIT(116), \ + _PyLong_DIGIT_INIT(117), \ + _PyLong_DIGIT_INIT(118), \ + _PyLong_DIGIT_INIT(119), \ + 
_PyLong_DIGIT_INIT(120), \ + _PyLong_DIGIT_INIT(121), \ + _PyLong_DIGIT_INIT(122), \ + _PyLong_DIGIT_INIT(123), \ + _PyLong_DIGIT_INIT(124), \ + _PyLong_DIGIT_INIT(125), \ + _PyLong_DIGIT_INIT(126), \ + _PyLong_DIGIT_INIT(127), \ + _PyLong_DIGIT_INIT(128), \ + _PyLong_DIGIT_INIT(129), \ + _PyLong_DIGIT_INIT(130), \ + _PyLong_DIGIT_INIT(131), \ + _PyLong_DIGIT_INIT(132), \ + _PyLong_DIGIT_INIT(133), \ + _PyLong_DIGIT_INIT(134), \ + _PyLong_DIGIT_INIT(135), \ + _PyLong_DIGIT_INIT(136), \ + _PyLong_DIGIT_INIT(137), \ + _PyLong_DIGIT_INIT(138), \ + _PyLong_DIGIT_INIT(139), \ + _PyLong_DIGIT_INIT(140), \ + _PyLong_DIGIT_INIT(141), \ + _PyLong_DIGIT_INIT(142), \ + _PyLong_DIGIT_INIT(143), \ + _PyLong_DIGIT_INIT(144), \ + _PyLong_DIGIT_INIT(145), \ + _PyLong_DIGIT_INIT(146), \ + _PyLong_DIGIT_INIT(147), \ + _PyLong_DIGIT_INIT(148), \ + _PyLong_DIGIT_INIT(149), \ + _PyLong_DIGIT_INIT(150), \ + _PyLong_DIGIT_INIT(151), \ + _PyLong_DIGIT_INIT(152), \ + _PyLong_DIGIT_INIT(153), \ + _PyLong_DIGIT_INIT(154), \ + _PyLong_DIGIT_INIT(155), \ + _PyLong_DIGIT_INIT(156), \ + _PyLong_DIGIT_INIT(157), \ + _PyLong_DIGIT_INIT(158), \ + _PyLong_DIGIT_INIT(159), \ + _PyLong_DIGIT_INIT(160), \ + _PyLong_DIGIT_INIT(161), \ + _PyLong_DIGIT_INIT(162), \ + _PyLong_DIGIT_INIT(163), \ + _PyLong_DIGIT_INIT(164), \ + _PyLong_DIGIT_INIT(165), \ + _PyLong_DIGIT_INIT(166), \ + _PyLong_DIGIT_INIT(167), \ + _PyLong_DIGIT_INIT(168), \ + _PyLong_DIGIT_INIT(169), \ + _PyLong_DIGIT_INIT(170), \ + _PyLong_DIGIT_INIT(171), \ + _PyLong_DIGIT_INIT(172), \ + _PyLong_DIGIT_INIT(173), \ + _PyLong_DIGIT_INIT(174), \ + _PyLong_DIGIT_INIT(175), \ + _PyLong_DIGIT_INIT(176), \ + _PyLong_DIGIT_INIT(177), \ + _PyLong_DIGIT_INIT(178), \ + _PyLong_DIGIT_INIT(179), \ + _PyLong_DIGIT_INIT(180), \ + _PyLong_DIGIT_INIT(181), \ + _PyLong_DIGIT_INIT(182), \ + _PyLong_DIGIT_INIT(183), \ + _PyLong_DIGIT_INIT(184), \ + _PyLong_DIGIT_INIT(185), \ + _PyLong_DIGIT_INIT(186), \ + _PyLong_DIGIT_INIT(187), \ + _PyLong_DIGIT_INIT(188), \ + _PyLong_DIGIT_INIT(189), \ + _PyLong_DIGIT_INIT(190), \ + _PyLong_DIGIT_INIT(191), \ + _PyLong_DIGIT_INIT(192), \ + _PyLong_DIGIT_INIT(193), \ + _PyLong_DIGIT_INIT(194), \ + _PyLong_DIGIT_INIT(195), \ + _PyLong_DIGIT_INIT(196), \ + _PyLong_DIGIT_INIT(197), \ + _PyLong_DIGIT_INIT(198), \ + _PyLong_DIGIT_INIT(199), \ + _PyLong_DIGIT_INIT(200), \ + _PyLong_DIGIT_INIT(201), \ + _PyLong_DIGIT_INIT(202), \ + _PyLong_DIGIT_INIT(203), \ + _PyLong_DIGIT_INIT(204), \ + _PyLong_DIGIT_INIT(205), \ + _PyLong_DIGIT_INIT(206), \ + _PyLong_DIGIT_INIT(207), \ + _PyLong_DIGIT_INIT(208), \ + _PyLong_DIGIT_INIT(209), \ + _PyLong_DIGIT_INIT(210), \ + _PyLong_DIGIT_INIT(211), \ + _PyLong_DIGIT_INIT(212), \ + _PyLong_DIGIT_INIT(213), \ + _PyLong_DIGIT_INIT(214), \ + _PyLong_DIGIT_INIT(215), \ + _PyLong_DIGIT_INIT(216), \ + _PyLong_DIGIT_INIT(217), \ + _PyLong_DIGIT_INIT(218), \ + _PyLong_DIGIT_INIT(219), \ + _PyLong_DIGIT_INIT(220), \ + _PyLong_DIGIT_INIT(221), \ + _PyLong_DIGIT_INIT(222), \ + _PyLong_DIGIT_INIT(223), \ + _PyLong_DIGIT_INIT(224), \ + _PyLong_DIGIT_INIT(225), \ + _PyLong_DIGIT_INIT(226), \ + _PyLong_DIGIT_INIT(227), \ + _PyLong_DIGIT_INIT(228), \ + _PyLong_DIGIT_INIT(229), \ + _PyLong_DIGIT_INIT(230), \ + _PyLong_DIGIT_INIT(231), \ + _PyLong_DIGIT_INIT(232), \ + _PyLong_DIGIT_INIT(233), \ + _PyLong_DIGIT_INIT(234), \ + _PyLong_DIGIT_INIT(235), \ + _PyLong_DIGIT_INIT(236), \ + _PyLong_DIGIT_INIT(237), \ + _PyLong_DIGIT_INIT(238), \ + _PyLong_DIGIT_INIT(239), \ + _PyLong_DIGIT_INIT(240), \ + _PyLong_DIGIT_INIT(241), \ + 
_PyLong_DIGIT_INIT(242), \ + _PyLong_DIGIT_INIT(243), \ + _PyLong_DIGIT_INIT(244), \ + _PyLong_DIGIT_INIT(245), \ + _PyLong_DIGIT_INIT(246), \ + _PyLong_DIGIT_INIT(247), \ + _PyLong_DIGIT_INIT(248), \ + _PyLong_DIGIT_INIT(249), \ + _PyLong_DIGIT_INIT(250), \ + _PyLong_DIGIT_INIT(251), \ + _PyLong_DIGIT_INIT(252), \ + _PyLong_DIGIT_INIT(253), \ + _PyLong_DIGIT_INIT(254), \ + _PyLong_DIGIT_INIT(255), \ + _PyLong_DIGIT_INIT(256), \ +} + +#define _Py_bytes_characters_INIT { \ + _PyBytes_CHAR_INIT(0), \ + _PyBytes_CHAR_INIT(1), \ + _PyBytes_CHAR_INIT(2), \ + _PyBytes_CHAR_INIT(3), \ + _PyBytes_CHAR_INIT(4), \ + _PyBytes_CHAR_INIT(5), \ + _PyBytes_CHAR_INIT(6), \ + _PyBytes_CHAR_INIT(7), \ + _PyBytes_CHAR_INIT(8), \ + _PyBytes_CHAR_INIT(9), \ + _PyBytes_CHAR_INIT(10), \ + _PyBytes_CHAR_INIT(11), \ + _PyBytes_CHAR_INIT(12), \ + _PyBytes_CHAR_INIT(13), \ + _PyBytes_CHAR_INIT(14), \ + _PyBytes_CHAR_INIT(15), \ + _PyBytes_CHAR_INIT(16), \ + _PyBytes_CHAR_INIT(17), \ + _PyBytes_CHAR_INIT(18), \ + _PyBytes_CHAR_INIT(19), \ + _PyBytes_CHAR_INIT(20), \ + _PyBytes_CHAR_INIT(21), \ + _PyBytes_CHAR_INIT(22), \ + _PyBytes_CHAR_INIT(23), \ + _PyBytes_CHAR_INIT(24), \ + _PyBytes_CHAR_INIT(25), \ + _PyBytes_CHAR_INIT(26), \ + _PyBytes_CHAR_INIT(27), \ + _PyBytes_CHAR_INIT(28), \ + _PyBytes_CHAR_INIT(29), \ + _PyBytes_CHAR_INIT(30), \ + _PyBytes_CHAR_INIT(31), \ + _PyBytes_CHAR_INIT(32), \ + _PyBytes_CHAR_INIT(33), \ + _PyBytes_CHAR_INIT(34), \ + _PyBytes_CHAR_INIT(35), \ + _PyBytes_CHAR_INIT(36), \ + _PyBytes_CHAR_INIT(37), \ + _PyBytes_CHAR_INIT(38), \ + _PyBytes_CHAR_INIT(39), \ + _PyBytes_CHAR_INIT(40), \ + _PyBytes_CHAR_INIT(41), \ + _PyBytes_CHAR_INIT(42), \ + _PyBytes_CHAR_INIT(43), \ + _PyBytes_CHAR_INIT(44), \ + _PyBytes_CHAR_INIT(45), \ + _PyBytes_CHAR_INIT(46), \ + _PyBytes_CHAR_INIT(47), \ + _PyBytes_CHAR_INIT(48), \ + _PyBytes_CHAR_INIT(49), \ + _PyBytes_CHAR_INIT(50), \ + _PyBytes_CHAR_INIT(51), \ + _PyBytes_CHAR_INIT(52), \ + _PyBytes_CHAR_INIT(53), \ + _PyBytes_CHAR_INIT(54), \ + _PyBytes_CHAR_INIT(55), \ + _PyBytes_CHAR_INIT(56), \ + _PyBytes_CHAR_INIT(57), \ + _PyBytes_CHAR_INIT(58), \ + _PyBytes_CHAR_INIT(59), \ + _PyBytes_CHAR_INIT(60), \ + _PyBytes_CHAR_INIT(61), \ + _PyBytes_CHAR_INIT(62), \ + _PyBytes_CHAR_INIT(63), \ + _PyBytes_CHAR_INIT(64), \ + _PyBytes_CHAR_INIT(65), \ + _PyBytes_CHAR_INIT(66), \ + _PyBytes_CHAR_INIT(67), \ + _PyBytes_CHAR_INIT(68), \ + _PyBytes_CHAR_INIT(69), \ + _PyBytes_CHAR_INIT(70), \ + _PyBytes_CHAR_INIT(71), \ + _PyBytes_CHAR_INIT(72), \ + _PyBytes_CHAR_INIT(73), \ + _PyBytes_CHAR_INIT(74), \ + _PyBytes_CHAR_INIT(75), \ + _PyBytes_CHAR_INIT(76), \ + _PyBytes_CHAR_INIT(77), \ + _PyBytes_CHAR_INIT(78), \ + _PyBytes_CHAR_INIT(79), \ + _PyBytes_CHAR_INIT(80), \ + _PyBytes_CHAR_INIT(81), \ + _PyBytes_CHAR_INIT(82), \ + _PyBytes_CHAR_INIT(83), \ + _PyBytes_CHAR_INIT(84), \ + _PyBytes_CHAR_INIT(85), \ + _PyBytes_CHAR_INIT(86), \ + _PyBytes_CHAR_INIT(87), \ + _PyBytes_CHAR_INIT(88), \ + _PyBytes_CHAR_INIT(89), \ + _PyBytes_CHAR_INIT(90), \ + _PyBytes_CHAR_INIT(91), \ + _PyBytes_CHAR_INIT(92), \ + _PyBytes_CHAR_INIT(93), \ + _PyBytes_CHAR_INIT(94), \ + _PyBytes_CHAR_INIT(95), \ + _PyBytes_CHAR_INIT(96), \ + _PyBytes_CHAR_INIT(97), \ + _PyBytes_CHAR_INIT(98), \ + _PyBytes_CHAR_INIT(99), \ + _PyBytes_CHAR_INIT(100), \ + _PyBytes_CHAR_INIT(101), \ + _PyBytes_CHAR_INIT(102), \ + _PyBytes_CHAR_INIT(103), \ + _PyBytes_CHAR_INIT(104), \ + _PyBytes_CHAR_INIT(105), \ + _PyBytes_CHAR_INIT(106), \ + _PyBytes_CHAR_INIT(107), \ + _PyBytes_CHAR_INIT(108), \ + _PyBytes_CHAR_INIT(109), 
\ + _PyBytes_CHAR_INIT(110), \ + _PyBytes_CHAR_INIT(111), \ + _PyBytes_CHAR_INIT(112), \ + _PyBytes_CHAR_INIT(113), \ + _PyBytes_CHAR_INIT(114), \ + _PyBytes_CHAR_INIT(115), \ + _PyBytes_CHAR_INIT(116), \ + _PyBytes_CHAR_INIT(117), \ + _PyBytes_CHAR_INIT(118), \ + _PyBytes_CHAR_INIT(119), \ + _PyBytes_CHAR_INIT(120), \ + _PyBytes_CHAR_INIT(121), \ + _PyBytes_CHAR_INIT(122), \ + _PyBytes_CHAR_INIT(123), \ + _PyBytes_CHAR_INIT(124), \ + _PyBytes_CHAR_INIT(125), \ + _PyBytes_CHAR_INIT(126), \ + _PyBytes_CHAR_INIT(127), \ + _PyBytes_CHAR_INIT(128), \ + _PyBytes_CHAR_INIT(129), \ + _PyBytes_CHAR_INIT(130), \ + _PyBytes_CHAR_INIT(131), \ + _PyBytes_CHAR_INIT(132), \ + _PyBytes_CHAR_INIT(133), \ + _PyBytes_CHAR_INIT(134), \ + _PyBytes_CHAR_INIT(135), \ + _PyBytes_CHAR_INIT(136), \ + _PyBytes_CHAR_INIT(137), \ + _PyBytes_CHAR_INIT(138), \ + _PyBytes_CHAR_INIT(139), \ + _PyBytes_CHAR_INIT(140), \ + _PyBytes_CHAR_INIT(141), \ + _PyBytes_CHAR_INIT(142), \ + _PyBytes_CHAR_INIT(143), \ + _PyBytes_CHAR_INIT(144), \ + _PyBytes_CHAR_INIT(145), \ + _PyBytes_CHAR_INIT(146), \ + _PyBytes_CHAR_INIT(147), \ + _PyBytes_CHAR_INIT(148), \ + _PyBytes_CHAR_INIT(149), \ + _PyBytes_CHAR_INIT(150), \ + _PyBytes_CHAR_INIT(151), \ + _PyBytes_CHAR_INIT(152), \ + _PyBytes_CHAR_INIT(153), \ + _PyBytes_CHAR_INIT(154), \ + _PyBytes_CHAR_INIT(155), \ + _PyBytes_CHAR_INIT(156), \ + _PyBytes_CHAR_INIT(157), \ + _PyBytes_CHAR_INIT(158), \ + _PyBytes_CHAR_INIT(159), \ + _PyBytes_CHAR_INIT(160), \ + _PyBytes_CHAR_INIT(161), \ + _PyBytes_CHAR_INIT(162), \ + _PyBytes_CHAR_INIT(163), \ + _PyBytes_CHAR_INIT(164), \ + _PyBytes_CHAR_INIT(165), \ + _PyBytes_CHAR_INIT(166), \ + _PyBytes_CHAR_INIT(167), \ + _PyBytes_CHAR_INIT(168), \ + _PyBytes_CHAR_INIT(169), \ + _PyBytes_CHAR_INIT(170), \ + _PyBytes_CHAR_INIT(171), \ + _PyBytes_CHAR_INIT(172), \ + _PyBytes_CHAR_INIT(173), \ + _PyBytes_CHAR_INIT(174), \ + _PyBytes_CHAR_INIT(175), \ + _PyBytes_CHAR_INIT(176), \ + _PyBytes_CHAR_INIT(177), \ + _PyBytes_CHAR_INIT(178), \ + _PyBytes_CHAR_INIT(179), \ + _PyBytes_CHAR_INIT(180), \ + _PyBytes_CHAR_INIT(181), \ + _PyBytes_CHAR_INIT(182), \ + _PyBytes_CHAR_INIT(183), \ + _PyBytes_CHAR_INIT(184), \ + _PyBytes_CHAR_INIT(185), \ + _PyBytes_CHAR_INIT(186), \ + _PyBytes_CHAR_INIT(187), \ + _PyBytes_CHAR_INIT(188), \ + _PyBytes_CHAR_INIT(189), \ + _PyBytes_CHAR_INIT(190), \ + _PyBytes_CHAR_INIT(191), \ + _PyBytes_CHAR_INIT(192), \ + _PyBytes_CHAR_INIT(193), \ + _PyBytes_CHAR_INIT(194), \ + _PyBytes_CHAR_INIT(195), \ + _PyBytes_CHAR_INIT(196), \ + _PyBytes_CHAR_INIT(197), \ + _PyBytes_CHAR_INIT(198), \ + _PyBytes_CHAR_INIT(199), \ + _PyBytes_CHAR_INIT(200), \ + _PyBytes_CHAR_INIT(201), \ + _PyBytes_CHAR_INIT(202), \ + _PyBytes_CHAR_INIT(203), \ + _PyBytes_CHAR_INIT(204), \ + _PyBytes_CHAR_INIT(205), \ + _PyBytes_CHAR_INIT(206), \ + _PyBytes_CHAR_INIT(207), \ + _PyBytes_CHAR_INIT(208), \ + _PyBytes_CHAR_INIT(209), \ + _PyBytes_CHAR_INIT(210), \ + _PyBytes_CHAR_INIT(211), \ + _PyBytes_CHAR_INIT(212), \ + _PyBytes_CHAR_INIT(213), \ + _PyBytes_CHAR_INIT(214), \ + _PyBytes_CHAR_INIT(215), \ + _PyBytes_CHAR_INIT(216), \ + _PyBytes_CHAR_INIT(217), \ + _PyBytes_CHAR_INIT(218), \ + _PyBytes_CHAR_INIT(219), \ + _PyBytes_CHAR_INIT(220), \ + _PyBytes_CHAR_INIT(221), \ + _PyBytes_CHAR_INIT(222), \ + _PyBytes_CHAR_INIT(223), \ + _PyBytes_CHAR_INIT(224), \ + _PyBytes_CHAR_INIT(225), \ + _PyBytes_CHAR_INIT(226), \ + _PyBytes_CHAR_INIT(227), \ + _PyBytes_CHAR_INIT(228), \ + _PyBytes_CHAR_INIT(229), \ + _PyBytes_CHAR_INIT(230), \ + _PyBytes_CHAR_INIT(231), \ + 
_PyBytes_CHAR_INIT(232), \ + _PyBytes_CHAR_INIT(233), \ + _PyBytes_CHAR_INIT(234), \ + _PyBytes_CHAR_INIT(235), \ + _PyBytes_CHAR_INIT(236), \ + _PyBytes_CHAR_INIT(237), \ + _PyBytes_CHAR_INIT(238), \ + _PyBytes_CHAR_INIT(239), \ + _PyBytes_CHAR_INIT(240), \ + _PyBytes_CHAR_INIT(241), \ + _PyBytes_CHAR_INIT(242), \ + _PyBytes_CHAR_INIT(243), \ + _PyBytes_CHAR_INIT(244), \ + _PyBytes_CHAR_INIT(245), \ + _PyBytes_CHAR_INIT(246), \ + _PyBytes_CHAR_INIT(247), \ + _PyBytes_CHAR_INIT(248), \ + _PyBytes_CHAR_INIT(249), \ + _PyBytes_CHAR_INIT(250), \ + _PyBytes_CHAR_INIT(251), \ + _PyBytes_CHAR_INIT(252), \ + _PyBytes_CHAR_INIT(253), \ + _PyBytes_CHAR_INIT(254), \ + _PyBytes_CHAR_INIT(255), \ +} + +#define _Py_str_literals_INIT { \ + INIT_STR(anon_dictcomp, ""), \ + INIT_STR(anon_genexpr, ""), \ + INIT_STR(anon_lambda, ""), \ + INIT_STR(anon_listcomp, ""), \ + INIT_STR(anon_module, ""), \ + INIT_STR(anon_null, ""), \ + INIT_STR(anon_setcomp, ""), \ + INIT_STR(anon_string, ""), \ + INIT_STR(anon_unknown, ""), \ + INIT_STR(dbl_close_br, "}}"), \ + INIT_STR(dbl_open_br, "{{"), \ + INIT_STR(dbl_percent, "%%"), \ + INIT_STR(defaults, ".defaults"), \ + INIT_STR(dot_locals, "."), \ + INIT_STR(empty, ""), \ + INIT_STR(generic_base, ".generic_base"), \ + INIT_STR(json_decoder, "json.decoder"), \ + INIT_STR(kwdefaults, ".kwdefaults"), \ + INIT_STR(list_err, "list index out of range"), \ + INIT_STR(str_replace_inf, "1e309"), \ + INIT_STR(type_params, ".type_params"), \ + INIT_STR(utf_8, "utf-8"), \ +} + +#define _Py_str_identifiers_INIT { \ + INIT_ID(CANCELLED), \ + INIT_ID(FINISHED), \ + INIT_ID(False), \ + INIT_ID(JSONDecodeError), \ + INIT_ID(PENDING), \ + INIT_ID(Py_Repr), \ + INIT_ID(TextIOWrapper), \ + INIT_ID(True), \ + INIT_ID(WarningMessage), \ + INIT_ID(_WindowsConsoleIO), \ + INIT_ID(__IOBase_closed), \ + INIT_ID(__abc_tpflags__), \ + INIT_ID(__abs__), \ + INIT_ID(__abstractmethods__), \ + INIT_ID(__add__), \ + INIT_ID(__aenter__), \ + INIT_ID(__aexit__), \ + INIT_ID(__aiter__), \ + INIT_ID(__all__), \ + INIT_ID(__and__), \ + INIT_ID(__anext__), \ + INIT_ID(__annotations__), \ + INIT_ID(__args__), \ + INIT_ID(__await__), \ + INIT_ID(__bases__), \ + INIT_ID(__bool__), \ + INIT_ID(__buffer__), \ + INIT_ID(__build_class__), \ + INIT_ID(__builtins__), \ + INIT_ID(__bytes__), \ + INIT_ID(__call__), \ + INIT_ID(__cantrace__), \ + INIT_ID(__class__), \ + INIT_ID(__class_getitem__), \ + INIT_ID(__classcell__), \ + INIT_ID(__classdict__), \ + INIT_ID(__classdictcell__), \ + INIT_ID(__complex__), \ + INIT_ID(__contains__), \ + INIT_ID(__copy__), \ + INIT_ID(__ctypes_from_outparam__), \ + INIT_ID(__del__), \ + INIT_ID(__delattr__), \ + INIT_ID(__delete__), \ + INIT_ID(__delitem__), \ + INIT_ID(__dict__), \ + INIT_ID(__dictoffset__), \ + INIT_ID(__dir__), \ + INIT_ID(__divmod__), \ + INIT_ID(__doc__), \ + INIT_ID(__enter__), \ + INIT_ID(__eq__), \ + INIT_ID(__exit__), \ + INIT_ID(__file__), \ + INIT_ID(__firstlineno__), \ + INIT_ID(__float__), \ + INIT_ID(__floordiv__), \ + INIT_ID(__format__), \ + INIT_ID(__fspath__), \ + INIT_ID(__ge__), \ + INIT_ID(__get__), \ + INIT_ID(__getattr__), \ + INIT_ID(__getattribute__), \ + INIT_ID(__getinitargs__), \ + INIT_ID(__getitem__), \ + INIT_ID(__getnewargs__), \ + INIT_ID(__getnewargs_ex__), \ + INIT_ID(__getstate__), \ + INIT_ID(__gt__), \ + INIT_ID(__hash__), \ + INIT_ID(__iadd__), \ + INIT_ID(__iand__), \ + INIT_ID(__ifloordiv__), \ + INIT_ID(__ilshift__), \ + INIT_ID(__imatmul__), \ + INIT_ID(__imod__), \ + INIT_ID(__import__), \ + INIT_ID(__imul__), \ + 
INIT_ID(__index__), \ + INIT_ID(__init__), \ + INIT_ID(__init_subclass__), \ + INIT_ID(__instancecheck__), \ + INIT_ID(__int__), \ + INIT_ID(__invert__), \ + INIT_ID(__ior__), \ + INIT_ID(__ipow__), \ + INIT_ID(__irshift__), \ + INIT_ID(__isabstractmethod__), \ + INIT_ID(__isub__), \ + INIT_ID(__iter__), \ + INIT_ID(__itruediv__), \ + INIT_ID(__ixor__), \ + INIT_ID(__le__), \ + INIT_ID(__len__), \ + INIT_ID(__length_hint__), \ + INIT_ID(__lltrace__), \ + INIT_ID(__loader__), \ + INIT_ID(__lshift__), \ + INIT_ID(__lt__), \ + INIT_ID(__main__), \ + INIT_ID(__match_args__), \ + INIT_ID(__matmul__), \ + INIT_ID(__missing__), \ + INIT_ID(__mod__), \ + INIT_ID(__module__), \ + INIT_ID(__mro_entries__), \ + INIT_ID(__mul__), \ + INIT_ID(__name__), \ + INIT_ID(__ne__), \ + INIT_ID(__neg__), \ + INIT_ID(__new__), \ + INIT_ID(__newobj__), \ + INIT_ID(__newobj_ex__), \ + INIT_ID(__next__), \ + INIT_ID(__notes__), \ + INIT_ID(__or__), \ + INIT_ID(__orig_class__), \ + INIT_ID(__origin__), \ + INIT_ID(__package__), \ + INIT_ID(__parameters__), \ + INIT_ID(__path__), \ + INIT_ID(__pos__), \ + INIT_ID(__pow__), \ + INIT_ID(__prepare__), \ + INIT_ID(__qualname__), \ + INIT_ID(__radd__), \ + INIT_ID(__rand__), \ + INIT_ID(__rdivmod__), \ + INIT_ID(__reduce__), \ + INIT_ID(__reduce_ex__), \ + INIT_ID(__release_buffer__), \ + INIT_ID(__repr__), \ + INIT_ID(__reversed__), \ + INIT_ID(__rfloordiv__), \ + INIT_ID(__rlshift__), \ + INIT_ID(__rmatmul__), \ + INIT_ID(__rmod__), \ + INIT_ID(__rmul__), \ + INIT_ID(__ror__), \ + INIT_ID(__round__), \ + INIT_ID(__rpow__), \ + INIT_ID(__rrshift__), \ + INIT_ID(__rshift__), \ + INIT_ID(__rsub__), \ + INIT_ID(__rtruediv__), \ + INIT_ID(__rxor__), \ + INIT_ID(__set__), \ + INIT_ID(__set_name__), \ + INIT_ID(__setattr__), \ + INIT_ID(__setitem__), \ + INIT_ID(__setstate__), \ + INIT_ID(__sizeof__), \ + INIT_ID(__slotnames__), \ + INIT_ID(__slots__), \ + INIT_ID(__spec__), \ + INIT_ID(__static_attributes__), \ + INIT_ID(__str__), \ + INIT_ID(__sub__), \ + INIT_ID(__subclasscheck__), \ + INIT_ID(__subclasshook__), \ + INIT_ID(__truediv__), \ + INIT_ID(__trunc__), \ + INIT_ID(__type_params__), \ + INIT_ID(__typing_is_unpacked_typevartuple__), \ + INIT_ID(__typing_prepare_subst__), \ + INIT_ID(__typing_subst__), \ + INIT_ID(__typing_unpacked_tuple_args__), \ + INIT_ID(__warningregistry__), \ + INIT_ID(__weaklistoffset__), \ + INIT_ID(__weakref__), \ + INIT_ID(__xor__), \ + INIT_ID(_abc_impl), \ + INIT_ID(_abstract_), \ + INIT_ID(_active), \ + INIT_ID(_align_), \ + INIT_ID(_annotation), \ + INIT_ID(_anonymous_), \ + INIT_ID(_argtypes_), \ + INIT_ID(_as_parameter_), \ + INIT_ID(_asyncio_future_blocking), \ + INIT_ID(_blksize), \ + INIT_ID(_bootstrap), \ + INIT_ID(_check_retval_), \ + INIT_ID(_dealloc_warn), \ + INIT_ID(_feature_version), \ + INIT_ID(_field_types), \ + INIT_ID(_fields_), \ + INIT_ID(_finalizing), \ + INIT_ID(_find_and_load), \ + INIT_ID(_fix_up_module), \ + INIT_ID(_flags_), \ + INIT_ID(_get_sourcefile), \ + INIT_ID(_handle_fromlist), \ + INIT_ID(_initializing), \ + INIT_ID(_io), \ + INIT_ID(_is_text_encoding), \ + INIT_ID(_length_), \ + INIT_ID(_limbo), \ + INIT_ID(_lock_unlock_module), \ + INIT_ID(_loop), \ + INIT_ID(_needs_com_addref_), \ + INIT_ID(_only_immortal), \ + INIT_ID(_pack_), \ + INIT_ID(_restype_), \ + INIT_ID(_showwarnmsg), \ + INIT_ID(_shutdown), \ + INIT_ID(_slotnames), \ + INIT_ID(_strptime), \ + INIT_ID(_strptime_datetime), \ + INIT_ID(_swappedbytes_), \ + INIT_ID(_type_), \ + INIT_ID(_uninitialized_submodules), \ + 
INIT_ID(_warn_unawaited_coroutine), \ + INIT_ID(_xoptions), \ + INIT_ID(abs_tol), \ + INIT_ID(access), \ + INIT_ID(aclose), \ + INIT_ID(add), \ + INIT_ID(add_done_callback), \ + INIT_ID(after_in_child), \ + INIT_ID(after_in_parent), \ + INIT_ID(aggregate_class), \ + INIT_ID(alias), \ + INIT_ID(allow_code), \ + INIT_ID(append), \ + INIT_ID(arg), \ + INIT_ID(argdefs), \ + INIT_ID(args), \ + INIT_ID(arguments), \ + INIT_ID(argv), \ + INIT_ID(as_integer_ratio), \ + INIT_ID(asend), \ + INIT_ID(ast), \ + INIT_ID(athrow), \ + INIT_ID(attribute), \ + INIT_ID(authorizer_callback), \ + INIT_ID(autocommit), \ + INIT_ID(backtick), \ + INIT_ID(base), \ + INIT_ID(before), \ + INIT_ID(big), \ + INIT_ID(binary_form), \ + INIT_ID(block), \ + INIT_ID(bound), \ + INIT_ID(buffer), \ + INIT_ID(buffer_callback), \ + INIT_ID(buffer_size), \ + INIT_ID(buffering), \ + INIT_ID(buffers), \ + INIT_ID(bufsize), \ + INIT_ID(builtins), \ + INIT_ID(byteorder), \ + INIT_ID(bytes), \ + INIT_ID(bytes_per_sep), \ + INIT_ID(c_call), \ + INIT_ID(c_exception), \ + INIT_ID(c_return), \ + INIT_ID(cached_datetime_module), \ + INIT_ID(cached_statements), \ + INIT_ID(cadata), \ + INIT_ID(cafile), \ + INIT_ID(call), \ + INIT_ID(call_exception_handler), \ + INIT_ID(call_soon), \ + INIT_ID(callback), \ + INIT_ID(cancel), \ + INIT_ID(capath), \ + INIT_ID(category), \ + INIT_ID(cb_type), \ + INIT_ID(certfile), \ + INIT_ID(check_same_thread), \ + INIT_ID(clear), \ + INIT_ID(close), \ + INIT_ID(closed), \ + INIT_ID(closefd), \ + INIT_ID(closure), \ + INIT_ID(co_argcount), \ + INIT_ID(co_cellvars), \ + INIT_ID(co_code), \ + INIT_ID(co_consts), \ + INIT_ID(co_exceptiontable), \ + INIT_ID(co_filename), \ + INIT_ID(co_firstlineno), \ + INIT_ID(co_flags), \ + INIT_ID(co_freevars), \ + INIT_ID(co_kwonlyargcount), \ + INIT_ID(co_linetable), \ + INIT_ID(co_name), \ + INIT_ID(co_names), \ + INIT_ID(co_nlocals), \ + INIT_ID(co_posonlyargcount), \ + INIT_ID(co_qualname), \ + INIT_ID(co_stacksize), \ + INIT_ID(co_varnames), \ + INIT_ID(code), \ + INIT_ID(col_offset), \ + INIT_ID(command), \ + INIT_ID(comment_factory), \ + INIT_ID(compile_mode), \ + INIT_ID(consts), \ + INIT_ID(context), \ + INIT_ID(contravariant), \ + INIT_ID(cookie), \ + INIT_ID(copy), \ + INIT_ID(copyreg), \ + INIT_ID(coro), \ + INIT_ID(count), \ + INIT_ID(covariant), \ + INIT_ID(cwd), \ + INIT_ID(data), \ + INIT_ID(database), \ + INIT_ID(day), \ + INIT_ID(decode), \ + INIT_ID(decoder), \ + INIT_ID(default), \ + INIT_ID(defaultaction), \ + INIT_ID(delete), \ + INIT_ID(depth), \ + INIT_ID(desired_access), \ + INIT_ID(detect_types), \ + INIT_ID(deterministic), \ + INIT_ID(device), \ + INIT_ID(dict), \ + INIT_ID(dictcomp), \ + INIT_ID(difference_update), \ + INIT_ID(digest), \ + INIT_ID(digest_size), \ + INIT_ID(digestmod), \ + INIT_ID(dir_fd), \ + INIT_ID(discard), \ + INIT_ID(dispatch_table), \ + INIT_ID(displayhook), \ + INIT_ID(dklen), \ + INIT_ID(doc), \ + INIT_ID(dont_inherit), \ + INIT_ID(dst), \ + INIT_ID(dst_dir_fd), \ + INIT_ID(eager_start), \ + INIT_ID(effective_ids), \ + INIT_ID(element_factory), \ + INIT_ID(encode), \ + INIT_ID(encoding), \ + INIT_ID(end), \ + INIT_ID(end_col_offset), \ + INIT_ID(end_lineno), \ + INIT_ID(end_offset), \ + INIT_ID(endpos), \ + INIT_ID(entrypoint), \ + INIT_ID(env), \ + INIT_ID(errors), \ + INIT_ID(event), \ + INIT_ID(eventmask), \ + INIT_ID(exc_type), \ + INIT_ID(exc_value), \ + INIT_ID(excepthook), \ + INIT_ID(exception), \ + INIT_ID(existing_file_name), \ + INIT_ID(exp), \ + INIT_ID(extend), \ + INIT_ID(extra_tokens), \ + 
INIT_ID(facility), \ + INIT_ID(factory), \ + INIT_ID(false), \ + INIT_ID(family), \ + INIT_ID(fanout), \ + INIT_ID(fd), \ + INIT_ID(fd2), \ + INIT_ID(fdel), \ + INIT_ID(fget), \ + INIT_ID(file), \ + INIT_ID(file_actions), \ + INIT_ID(filename), \ + INIT_ID(fileno), \ + INIT_ID(filepath), \ + INIT_ID(fillvalue), \ + INIT_ID(filter), \ + INIT_ID(filters), \ + INIT_ID(final), \ + INIT_ID(find_class), \ + INIT_ID(fix_imports), \ + INIT_ID(flags), \ + INIT_ID(flush), \ + INIT_ID(fold), \ + INIT_ID(follow_symlinks), \ + INIT_ID(format), \ + INIT_ID(from_param), \ + INIT_ID(fromlist), \ + INIT_ID(fromtimestamp), \ + INIT_ID(fromutc), \ + INIT_ID(fset), \ + INIT_ID(func), \ + INIT_ID(future), \ + INIT_ID(generation), \ + INIT_ID(genexpr), \ + INIT_ID(get), \ + INIT_ID(get_debug), \ + INIT_ID(get_event_loop), \ + INIT_ID(get_loop), \ + INIT_ID(get_source), \ + INIT_ID(getattr), \ + INIT_ID(getstate), \ + INIT_ID(gid), \ + INIT_ID(globals), \ + INIT_ID(groupindex), \ + INIT_ID(groups), \ + INIT_ID(handle), \ + INIT_ID(handle_seq), \ + INIT_ID(has_location), \ + INIT_ID(hash_name), \ + INIT_ID(header), \ + INIT_ID(headers), \ + INIT_ID(hi), \ + INIT_ID(hook), \ + INIT_ID(hour), \ + INIT_ID(ident), \ + INIT_ID(identity_hint), \ + INIT_ID(ignore), \ + INIT_ID(imag), \ + INIT_ID(importlib), \ + INIT_ID(in_fd), \ + INIT_ID(incoming), \ + INIT_ID(indexgroup), \ + INIT_ID(inf), \ + INIT_ID(infer_variance), \ + INIT_ID(inherit_handle), \ + INIT_ID(inheritable), \ + INIT_ID(initial), \ + INIT_ID(initial_bytes), \ + INIT_ID(initial_owner), \ + INIT_ID(initial_state), \ + INIT_ID(initial_value), \ + INIT_ID(initval), \ + INIT_ID(inner_size), \ + INIT_ID(input), \ + INIT_ID(insert_comments), \ + INIT_ID(insert_pis), \ + INIT_ID(instructions), \ + INIT_ID(intern), \ + INIT_ID(intersection), \ + INIT_ID(interval), \ + INIT_ID(is_running), \ + INIT_ID(isatty), \ + INIT_ID(isinstance), \ + INIT_ID(isoformat), \ + INIT_ID(isolation_level), \ + INIT_ID(istext), \ + INIT_ID(item), \ + INIT_ID(items), \ + INIT_ID(iter), \ + INIT_ID(iterable), \ + INIT_ID(iterations), \ + INIT_ID(join), \ + INIT_ID(jump), \ + INIT_ID(keepends), \ + INIT_ID(key), \ + INIT_ID(keyfile), \ + INIT_ID(keys), \ + INIT_ID(kind), \ + INIT_ID(kw), \ + INIT_ID(kw1), \ + INIT_ID(kw2), \ + INIT_ID(kwdefaults), \ + INIT_ID(label), \ + INIT_ID(lambda), \ + INIT_ID(last), \ + INIT_ID(last_exc), \ + INIT_ID(last_node), \ + INIT_ID(last_traceback), \ + INIT_ID(last_type), \ + INIT_ID(last_value), \ + INIT_ID(latin1), \ + INIT_ID(leaf_size), \ + INIT_ID(len), \ + INIT_ID(length), \ + INIT_ID(level), \ + INIT_ID(limit), \ + INIT_ID(line), \ + INIT_ID(line_buffering), \ + INIT_ID(lineno), \ + INIT_ID(listcomp), \ + INIT_ID(little), \ + INIT_ID(lo), \ + INIT_ID(locale), \ + INIT_ID(locals), \ + INIT_ID(logoption), \ + INIT_ID(loop), \ + INIT_ID(manual_reset), \ + INIT_ID(mapping), \ + INIT_ID(match), \ + INIT_ID(max_length), \ + INIT_ID(maxdigits), \ + INIT_ID(maxevents), \ + INIT_ID(maxlen), \ + INIT_ID(maxmem), \ + INIT_ID(maxsplit), \ + INIT_ID(maxvalue), \ + INIT_ID(memLevel), \ + INIT_ID(memlimit), \ + INIT_ID(message), \ + INIT_ID(metaclass), \ + INIT_ID(metadata), \ + INIT_ID(method), \ + INIT_ID(microsecond), \ + INIT_ID(milliseconds), \ + INIT_ID(minute), \ + INIT_ID(mod), \ + INIT_ID(mode), \ + INIT_ID(module), \ + INIT_ID(module_globals), \ + INIT_ID(modules), \ + INIT_ID(month), \ + INIT_ID(mro), \ + INIT_ID(msg), \ + INIT_ID(mutex), \ + INIT_ID(mycmp), \ + INIT_ID(n_arg), \ + INIT_ID(n_fields), \ + INIT_ID(n_sequence_fields), \ + 
INIT_ID(n_unnamed_fields), \ + INIT_ID(name), \ + INIT_ID(name_from), \ + INIT_ID(namespace_separator), \ + INIT_ID(namespaces), \ + INIT_ID(narg), \ + INIT_ID(ndigits), \ + INIT_ID(nested), \ + INIT_ID(new_file_name), \ + INIT_ID(new_limit), \ + INIT_ID(newline), \ + INIT_ID(newlines), \ + INIT_ID(next), \ + INIT_ID(nlocals), \ + INIT_ID(node_depth), \ + INIT_ID(node_offset), \ + INIT_ID(ns), \ + INIT_ID(nstype), \ + INIT_ID(nt), \ + INIT_ID(null), \ + INIT_ID(number), \ + INIT_ID(obj), \ + INIT_ID(object), \ + INIT_ID(offset), \ + INIT_ID(offset_dst), \ + INIT_ID(offset_src), \ + INIT_ID(on_type_read), \ + INIT_ID(onceregistry), \ + INIT_ID(only_keys), \ + INIT_ID(oparg), \ + INIT_ID(opcode), \ + INIT_ID(open), \ + INIT_ID(opener), \ + INIT_ID(operation), \ + INIT_ID(optimize), \ + INIT_ID(options), \ + INIT_ID(order), \ + INIT_ID(origin), \ + INIT_ID(out_fd), \ + INIT_ID(outgoing), \ + INIT_ID(overlapped), \ + INIT_ID(owner), \ + INIT_ID(pages), \ + INIT_ID(parent), \ + INIT_ID(password), \ + INIT_ID(path), \ + INIT_ID(pattern), \ + INIT_ID(peek), \ + INIT_ID(persistent_id), \ + INIT_ID(persistent_load), \ + INIT_ID(person), \ + INIT_ID(pi_factory), \ + INIT_ID(pid), \ + INIT_ID(policy), \ + INIT_ID(pos), \ + INIT_ID(pos1), \ + INIT_ID(pos2), \ + INIT_ID(posix), \ + INIT_ID(print_file_and_line), \ + INIT_ID(priority), \ + INIT_ID(progress), \ + INIT_ID(progress_handler), \ + INIT_ID(progress_routine), \ + INIT_ID(proto), \ + INIT_ID(protocol), \ + INIT_ID(ps1), \ + INIT_ID(ps2), \ + INIT_ID(query), \ + INIT_ID(quotetabs), \ + INIT_ID(raw), \ + INIT_ID(read), \ + INIT_ID(read1), \ + INIT_ID(readable), \ + INIT_ID(readall), \ + INIT_ID(readinto), \ + INIT_ID(readinto1), \ + INIT_ID(readline), \ + INIT_ID(readonly), \ + INIT_ID(real), \ + INIT_ID(reducer_override), \ + INIT_ID(registry), \ + INIT_ID(rel_tol), \ + INIT_ID(release), \ + INIT_ID(reload), \ + INIT_ID(repl), \ + INIT_ID(replace), \ + INIT_ID(reserved), \ + INIT_ID(reset), \ + INIT_ID(resetids), \ + INIT_ID(return), \ + INIT_ID(reverse), \ + INIT_ID(reversed), \ + INIT_ID(salt), \ + INIT_ID(sched_priority), \ + INIT_ID(scheduler), \ + INIT_ID(second), \ + INIT_ID(security_attributes), \ + INIT_ID(seek), \ + INIT_ID(seekable), \ + INIT_ID(selectors), \ + INIT_ID(self), \ + INIT_ID(send), \ + INIT_ID(sep), \ + INIT_ID(sequence), \ + INIT_ID(server_hostname), \ + INIT_ID(server_side), \ + INIT_ID(session), \ + INIT_ID(setcomp), \ + INIT_ID(setpgroup), \ + INIT_ID(setsid), \ + INIT_ID(setsigdef), \ + INIT_ID(setsigmask), \ + INIT_ID(setstate), \ + INIT_ID(shape), \ + INIT_ID(show_cmd), \ + INIT_ID(signed), \ + INIT_ID(size), \ + INIT_ID(sizehint), \ + INIT_ID(skip_file_prefixes), \ + INIT_ID(sleep), \ + INIT_ID(sock), \ + INIT_ID(sort), \ + INIT_ID(source), \ + INIT_ID(source_traceback), \ + INIT_ID(spam), \ + INIT_ID(src), \ + INIT_ID(src_dir_fd), \ + INIT_ID(stacklevel), \ + INIT_ID(start), \ + INIT_ID(statement), \ + INIT_ID(status), \ + INIT_ID(stderr), \ + INIT_ID(stdin), \ + INIT_ID(stdout), \ + INIT_ID(step), \ + INIT_ID(steps), \ + INIT_ID(store_name), \ + INIT_ID(strategy), \ + INIT_ID(strftime), \ + INIT_ID(strict), \ + INIT_ID(strict_mode), \ + INIT_ID(string), \ + INIT_ID(sub_key), \ + INIT_ID(symmetric_difference_update), \ + INIT_ID(tabsize), \ + INIT_ID(tag), \ + INIT_ID(target), \ + INIT_ID(target_is_directory), \ + INIT_ID(task), \ + INIT_ID(tb_frame), \ + INIT_ID(tb_lasti), \ + INIT_ID(tb_lineno), \ + INIT_ID(tb_next), \ + INIT_ID(tell), \ + INIT_ID(template), \ + INIT_ID(term), \ + INIT_ID(text), \ + 
INIT_ID(threading), \ + INIT_ID(throw), \ + INIT_ID(timeout), \ + INIT_ID(times), \ + INIT_ID(timetuple), \ + INIT_ID(top), \ + INIT_ID(trace_callback), \ + INIT_ID(traceback), \ + INIT_ID(trailers), \ + INIT_ID(translate), \ + INIT_ID(true), \ + INIT_ID(truncate), \ + INIT_ID(twice), \ + INIT_ID(txt), \ + INIT_ID(type), \ + INIT_ID(type_params), \ + INIT_ID(tz), \ + INIT_ID(tzinfo), \ + INIT_ID(tzname), \ + INIT_ID(uid), \ + INIT_ID(unlink), \ + INIT_ID(unraisablehook), \ + INIT_ID(uri), \ + INIT_ID(usedforsecurity), \ + INIT_ID(value), \ + INIT_ID(values), \ + INIT_ID(version), \ + INIT_ID(volume), \ + INIT_ID(wait_all), \ + INIT_ID(warn_on_full_buffer), \ + INIT_ID(warnings), \ + INIT_ID(warnoptions), \ + INIT_ID(wbits), \ + INIT_ID(week), \ + INIT_ID(weekday), \ + INIT_ID(which), \ + INIT_ID(who), \ + INIT_ID(withdata), \ + INIT_ID(writable), \ + INIT_ID(write), \ + INIT_ID(write_through), \ + INIT_ID(year), \ + INIT_ID(zdict), \ +} + +#define _Py_str_ascii_INIT { \ + _PyASCIIObject_INIT("\x00"), \ + _PyASCIIObject_INIT("\x01"), \ + _PyASCIIObject_INIT("\x02"), \ + _PyASCIIObject_INIT("\x03"), \ + _PyASCIIObject_INIT("\x04"), \ + _PyASCIIObject_INIT("\x05"), \ + _PyASCIIObject_INIT("\x06"), \ + _PyASCIIObject_INIT("\x07"), \ + _PyASCIIObject_INIT("\x08"), \ + _PyASCIIObject_INIT("\x09"), \ + _PyASCIIObject_INIT("\x0a"), \ + _PyASCIIObject_INIT("\x0b"), \ + _PyASCIIObject_INIT("\x0c"), \ + _PyASCIIObject_INIT("\x0d"), \ + _PyASCIIObject_INIT("\x0e"), \ + _PyASCIIObject_INIT("\x0f"), \ + _PyASCIIObject_INIT("\x10"), \ + _PyASCIIObject_INIT("\x11"), \ + _PyASCIIObject_INIT("\x12"), \ + _PyASCIIObject_INIT("\x13"), \ + _PyASCIIObject_INIT("\x14"), \ + _PyASCIIObject_INIT("\x15"), \ + _PyASCIIObject_INIT("\x16"), \ + _PyASCIIObject_INIT("\x17"), \ + _PyASCIIObject_INIT("\x18"), \ + _PyASCIIObject_INIT("\x19"), \ + _PyASCIIObject_INIT("\x1a"), \ + _PyASCIIObject_INIT("\x1b"), \ + _PyASCIIObject_INIT("\x1c"), \ + _PyASCIIObject_INIT("\x1d"), \ + _PyASCIIObject_INIT("\x1e"), \ + _PyASCIIObject_INIT("\x1f"), \ + _PyASCIIObject_INIT("\x20"), \ + _PyASCIIObject_INIT("\x21"), \ + _PyASCIIObject_INIT("\x22"), \ + _PyASCIIObject_INIT("\x23"), \ + _PyASCIIObject_INIT("\x24"), \ + _PyASCIIObject_INIT("\x25"), \ + _PyASCIIObject_INIT("\x26"), \ + _PyASCIIObject_INIT("\x27"), \ + _PyASCIIObject_INIT("\x28"), \ + _PyASCIIObject_INIT("\x29"), \ + _PyASCIIObject_INIT("\x2a"), \ + _PyASCIIObject_INIT("\x2b"), \ + _PyASCIIObject_INIT("\x2c"), \ + _PyASCIIObject_INIT("\x2d"), \ + _PyASCIIObject_INIT("\x2e"), \ + _PyASCIIObject_INIT("\x2f"), \ + _PyASCIIObject_INIT("\x30"), \ + _PyASCIIObject_INIT("\x31"), \ + _PyASCIIObject_INIT("\x32"), \ + _PyASCIIObject_INIT("\x33"), \ + _PyASCIIObject_INIT("\x34"), \ + _PyASCIIObject_INIT("\x35"), \ + _PyASCIIObject_INIT("\x36"), \ + _PyASCIIObject_INIT("\x37"), \ + _PyASCIIObject_INIT("\x38"), \ + _PyASCIIObject_INIT("\x39"), \ + _PyASCIIObject_INIT("\x3a"), \ + _PyASCIIObject_INIT("\x3b"), \ + _PyASCIIObject_INIT("\x3c"), \ + _PyASCIIObject_INIT("\x3d"), \ + _PyASCIIObject_INIT("\x3e"), \ + _PyASCIIObject_INIT("\x3f"), \ + _PyASCIIObject_INIT("\x40"), \ + _PyASCIIObject_INIT("\x41"), \ + _PyASCIIObject_INIT("\x42"), \ + _PyASCIIObject_INIT("\x43"), \ + _PyASCIIObject_INIT("\x44"), \ + _PyASCIIObject_INIT("\x45"), \ + _PyASCIIObject_INIT("\x46"), \ + _PyASCIIObject_INIT("\x47"), \ + _PyASCIIObject_INIT("\x48"), \ + _PyASCIIObject_INIT("\x49"), \ + _PyASCIIObject_INIT("\x4a"), \ + _PyASCIIObject_INIT("\x4b"), \ + _PyASCIIObject_INIT("\x4c"), \ + 
_PyASCIIObject_INIT("\x4d"), \ + _PyASCIIObject_INIT("\x4e"), \ + _PyASCIIObject_INIT("\x4f"), \ + _PyASCIIObject_INIT("\x50"), \ + _PyASCIIObject_INIT("\x51"), \ + _PyASCIIObject_INIT("\x52"), \ + _PyASCIIObject_INIT("\x53"), \ + _PyASCIIObject_INIT("\x54"), \ + _PyASCIIObject_INIT("\x55"), \ + _PyASCIIObject_INIT("\x56"), \ + _PyASCIIObject_INIT("\x57"), \ + _PyASCIIObject_INIT("\x58"), \ + _PyASCIIObject_INIT("\x59"), \ + _PyASCIIObject_INIT("\x5a"), \ + _PyASCIIObject_INIT("\x5b"), \ + _PyASCIIObject_INIT("\x5c"), \ + _PyASCIIObject_INIT("\x5d"), \ + _PyASCIIObject_INIT("\x5e"), \ + _PyASCIIObject_INIT("\x5f"), \ + _PyASCIIObject_INIT("\x60"), \ + _PyASCIIObject_INIT("\x61"), \ + _PyASCIIObject_INIT("\x62"), \ + _PyASCIIObject_INIT("\x63"), \ + _PyASCIIObject_INIT("\x64"), \ + _PyASCIIObject_INIT("\x65"), \ + _PyASCIIObject_INIT("\x66"), \ + _PyASCIIObject_INIT("\x67"), \ + _PyASCIIObject_INIT("\x68"), \ + _PyASCIIObject_INIT("\x69"), \ + _PyASCIIObject_INIT("\x6a"), \ + _PyASCIIObject_INIT("\x6b"), \ + _PyASCIIObject_INIT("\x6c"), \ + _PyASCIIObject_INIT("\x6d"), \ + _PyASCIIObject_INIT("\x6e"), \ + _PyASCIIObject_INIT("\x6f"), \ + _PyASCIIObject_INIT("\x70"), \ + _PyASCIIObject_INIT("\x71"), \ + _PyASCIIObject_INIT("\x72"), \ + _PyASCIIObject_INIT("\x73"), \ + _PyASCIIObject_INIT("\x74"), \ + _PyASCIIObject_INIT("\x75"), \ + _PyASCIIObject_INIT("\x76"), \ + _PyASCIIObject_INIT("\x77"), \ + _PyASCIIObject_INIT("\x78"), \ + _PyASCIIObject_INIT("\x79"), \ + _PyASCIIObject_INIT("\x7a"), \ + _PyASCIIObject_INIT("\x7b"), \ + _PyASCIIObject_INIT("\x7c"), \ + _PyASCIIObject_INIT("\x7d"), \ + _PyASCIIObject_INIT("\x7e"), \ + _PyASCIIObject_INIT("\x7f"), \ +} + +#define _Py_str_latin1_INIT { \ + _PyUnicode_LATIN1_INIT("\x80", "\xc2\x80"), \ + _PyUnicode_LATIN1_INIT("\x81", "\xc2\x81"), \ + _PyUnicode_LATIN1_INIT("\x82", "\xc2\x82"), \ + _PyUnicode_LATIN1_INIT("\x83", "\xc2\x83"), \ + _PyUnicode_LATIN1_INIT("\x84", "\xc2\x84"), \ + _PyUnicode_LATIN1_INIT("\x85", "\xc2\x85"), \ + _PyUnicode_LATIN1_INIT("\x86", "\xc2\x86"), \ + _PyUnicode_LATIN1_INIT("\x87", "\xc2\x87"), \ + _PyUnicode_LATIN1_INIT("\x88", "\xc2\x88"), \ + _PyUnicode_LATIN1_INIT("\x89", "\xc2\x89"), \ + _PyUnicode_LATIN1_INIT("\x8a", "\xc2\x8a"), \ + _PyUnicode_LATIN1_INIT("\x8b", "\xc2\x8b"), \ + _PyUnicode_LATIN1_INIT("\x8c", "\xc2\x8c"), \ + _PyUnicode_LATIN1_INIT("\x8d", "\xc2\x8d"), \ + _PyUnicode_LATIN1_INIT("\x8e", "\xc2\x8e"), \ + _PyUnicode_LATIN1_INIT("\x8f", "\xc2\x8f"), \ + _PyUnicode_LATIN1_INIT("\x90", "\xc2\x90"), \ + _PyUnicode_LATIN1_INIT("\x91", "\xc2\x91"), \ + _PyUnicode_LATIN1_INIT("\x92", "\xc2\x92"), \ + _PyUnicode_LATIN1_INIT("\x93", "\xc2\x93"), \ + _PyUnicode_LATIN1_INIT("\x94", "\xc2\x94"), \ + _PyUnicode_LATIN1_INIT("\x95", "\xc2\x95"), \ + _PyUnicode_LATIN1_INIT("\x96", "\xc2\x96"), \ + _PyUnicode_LATIN1_INIT("\x97", "\xc2\x97"), \ + _PyUnicode_LATIN1_INIT("\x98", "\xc2\x98"), \ + _PyUnicode_LATIN1_INIT("\x99", "\xc2\x99"), \ + _PyUnicode_LATIN1_INIT("\x9a", "\xc2\x9a"), \ + _PyUnicode_LATIN1_INIT("\x9b", "\xc2\x9b"), \ + _PyUnicode_LATIN1_INIT("\x9c", "\xc2\x9c"), \ + _PyUnicode_LATIN1_INIT("\x9d", "\xc2\x9d"), \ + _PyUnicode_LATIN1_INIT("\x9e", "\xc2\x9e"), \ + _PyUnicode_LATIN1_INIT("\x9f", "\xc2\x9f"), \ + _PyUnicode_LATIN1_INIT("\xa0", "\xc2\xa0"), \ + _PyUnicode_LATIN1_INIT("\xa1", "\xc2\xa1"), \ + _PyUnicode_LATIN1_INIT("\xa2", "\xc2\xa2"), \ + _PyUnicode_LATIN1_INIT("\xa3", "\xc2\xa3"), \ + _PyUnicode_LATIN1_INIT("\xa4", "\xc2\xa4"), \ + _PyUnicode_LATIN1_INIT("\xa5", "\xc2\xa5"), \ + 
_PyUnicode_LATIN1_INIT("\xa6", "\xc2\xa6"), \ + _PyUnicode_LATIN1_INIT("\xa7", "\xc2\xa7"), \ + _PyUnicode_LATIN1_INIT("\xa8", "\xc2\xa8"), \ + _PyUnicode_LATIN1_INIT("\xa9", "\xc2\xa9"), \ + _PyUnicode_LATIN1_INIT("\xaa", "\xc2\xaa"), \ + _PyUnicode_LATIN1_INIT("\xab", "\xc2\xab"), \ + _PyUnicode_LATIN1_INIT("\xac", "\xc2\xac"), \ + _PyUnicode_LATIN1_INIT("\xad", "\xc2\xad"), \ + _PyUnicode_LATIN1_INIT("\xae", "\xc2\xae"), \ + _PyUnicode_LATIN1_INIT("\xaf", "\xc2\xaf"), \ + _PyUnicode_LATIN1_INIT("\xb0", "\xc2\xb0"), \ + _PyUnicode_LATIN1_INIT("\xb1", "\xc2\xb1"), \ + _PyUnicode_LATIN1_INIT("\xb2", "\xc2\xb2"), \ + _PyUnicode_LATIN1_INIT("\xb3", "\xc2\xb3"), \ + _PyUnicode_LATIN1_INIT("\xb4", "\xc2\xb4"), \ + _PyUnicode_LATIN1_INIT("\xb5", "\xc2\xb5"), \ + _PyUnicode_LATIN1_INIT("\xb6", "\xc2\xb6"), \ + _PyUnicode_LATIN1_INIT("\xb7", "\xc2\xb7"), \ + _PyUnicode_LATIN1_INIT("\xb8", "\xc2\xb8"), \ + _PyUnicode_LATIN1_INIT("\xb9", "\xc2\xb9"), \ + _PyUnicode_LATIN1_INIT("\xba", "\xc2\xba"), \ + _PyUnicode_LATIN1_INIT("\xbb", "\xc2\xbb"), \ + _PyUnicode_LATIN1_INIT("\xbc", "\xc2\xbc"), \ + _PyUnicode_LATIN1_INIT("\xbd", "\xc2\xbd"), \ + _PyUnicode_LATIN1_INIT("\xbe", "\xc2\xbe"), \ + _PyUnicode_LATIN1_INIT("\xbf", "\xc2\xbf"), \ + _PyUnicode_LATIN1_INIT("\xc0", "\xc3\x80"), \ + _PyUnicode_LATIN1_INIT("\xc1", "\xc3\x81"), \ + _PyUnicode_LATIN1_INIT("\xc2", "\xc3\x82"), \ + _PyUnicode_LATIN1_INIT("\xc3", "\xc3\x83"), \ + _PyUnicode_LATIN1_INIT("\xc4", "\xc3\x84"), \ + _PyUnicode_LATIN1_INIT("\xc5", "\xc3\x85"), \ + _PyUnicode_LATIN1_INIT("\xc6", "\xc3\x86"), \ + _PyUnicode_LATIN1_INIT("\xc7", "\xc3\x87"), \ + _PyUnicode_LATIN1_INIT("\xc8", "\xc3\x88"), \ + _PyUnicode_LATIN1_INIT("\xc9", "\xc3\x89"), \ + _PyUnicode_LATIN1_INIT("\xca", "\xc3\x8a"), \ + _PyUnicode_LATIN1_INIT("\xcb", "\xc3\x8b"), \ + _PyUnicode_LATIN1_INIT("\xcc", "\xc3\x8c"), \ + _PyUnicode_LATIN1_INIT("\xcd", "\xc3\x8d"), \ + _PyUnicode_LATIN1_INIT("\xce", "\xc3\x8e"), \ + _PyUnicode_LATIN1_INIT("\xcf", "\xc3\x8f"), \ + _PyUnicode_LATIN1_INIT("\xd0", "\xc3\x90"), \ + _PyUnicode_LATIN1_INIT("\xd1", "\xc3\x91"), \ + _PyUnicode_LATIN1_INIT("\xd2", "\xc3\x92"), \ + _PyUnicode_LATIN1_INIT("\xd3", "\xc3\x93"), \ + _PyUnicode_LATIN1_INIT("\xd4", "\xc3\x94"), \ + _PyUnicode_LATIN1_INIT("\xd5", "\xc3\x95"), \ + _PyUnicode_LATIN1_INIT("\xd6", "\xc3\x96"), \ + _PyUnicode_LATIN1_INIT("\xd7", "\xc3\x97"), \ + _PyUnicode_LATIN1_INIT("\xd8", "\xc3\x98"), \ + _PyUnicode_LATIN1_INIT("\xd9", "\xc3\x99"), \ + _PyUnicode_LATIN1_INIT("\xda", "\xc3\x9a"), \ + _PyUnicode_LATIN1_INIT("\xdb", "\xc3\x9b"), \ + _PyUnicode_LATIN1_INIT("\xdc", "\xc3\x9c"), \ + _PyUnicode_LATIN1_INIT("\xdd", "\xc3\x9d"), \ + _PyUnicode_LATIN1_INIT("\xde", "\xc3\x9e"), \ + _PyUnicode_LATIN1_INIT("\xdf", "\xc3\x9f"), \ + _PyUnicode_LATIN1_INIT("\xe0", "\xc3\xa0"), \ + _PyUnicode_LATIN1_INIT("\xe1", "\xc3\xa1"), \ + _PyUnicode_LATIN1_INIT("\xe2", "\xc3\xa2"), \ + _PyUnicode_LATIN1_INIT("\xe3", "\xc3\xa3"), \ + _PyUnicode_LATIN1_INIT("\xe4", "\xc3\xa4"), \ + _PyUnicode_LATIN1_INIT("\xe5", "\xc3\xa5"), \ + _PyUnicode_LATIN1_INIT("\xe6", "\xc3\xa6"), \ + _PyUnicode_LATIN1_INIT("\xe7", "\xc3\xa7"), \ + _PyUnicode_LATIN1_INIT("\xe8", "\xc3\xa8"), \ + _PyUnicode_LATIN1_INIT("\xe9", "\xc3\xa9"), \ + _PyUnicode_LATIN1_INIT("\xea", "\xc3\xaa"), \ + _PyUnicode_LATIN1_INIT("\xeb", "\xc3\xab"), \ + _PyUnicode_LATIN1_INIT("\xec", "\xc3\xac"), \ + _PyUnicode_LATIN1_INIT("\xed", "\xc3\xad"), \ + _PyUnicode_LATIN1_INIT("\xee", "\xc3\xae"), \ + _PyUnicode_LATIN1_INIT("\xef", "\xc3\xaf"), \ + 
_PyUnicode_LATIN1_INIT("\xf0", "\xc3\xb0"), \ + _PyUnicode_LATIN1_INIT("\xf1", "\xc3\xb1"), \ + _PyUnicode_LATIN1_INIT("\xf2", "\xc3\xb2"), \ + _PyUnicode_LATIN1_INIT("\xf3", "\xc3\xb3"), \ + _PyUnicode_LATIN1_INIT("\xf4", "\xc3\xb4"), \ + _PyUnicode_LATIN1_INIT("\xf5", "\xc3\xb5"), \ + _PyUnicode_LATIN1_INIT("\xf6", "\xc3\xb6"), \ + _PyUnicode_LATIN1_INIT("\xf7", "\xc3\xb7"), \ + _PyUnicode_LATIN1_INIT("\xf8", "\xc3\xb8"), \ + _PyUnicode_LATIN1_INIT("\xf9", "\xc3\xb9"), \ + _PyUnicode_LATIN1_INIT("\xfa", "\xc3\xba"), \ + _PyUnicode_LATIN1_INIT("\xfb", "\xc3\xbb"), \ + _PyUnicode_LATIN1_INIT("\xfc", "\xc3\xbc"), \ + _PyUnicode_LATIN1_INIT("\xfd", "\xc3\xbd"), \ + _PyUnicode_LATIN1_INIT("\xfe", "\xc3\xbe"), \ + _PyUnicode_LATIN1_INIT("\xff", "\xc3\xbf"), \ +} +/* End auto-generated code */ + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_RUNTIME_INIT_GENERATED_H */ diff --git a/Include/internal/pycore_semaphore.h b/Include/internal/pycore_semaphore.h new file mode 100644 index 0000000000000000000000000000000000000000..269538384606ce1b031a75f928038e253c5efce4 --- /dev/null +++ b/Include/internal/pycore_semaphore.h @@ -0,0 +1,67 @@ +// The _PySemaphore API a simplified cross-platform semaphore used to implement +// wakeup/sleep. +#ifndef Py_INTERNAL_SEMAPHORE_H +#define Py_INTERNAL_SEMAPHORE_H + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_pythread.h" // _POSIX_SEMAPHORES + +#ifdef MS_WINDOWS +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# include +#elif defined(HAVE_PTHREAD_H) +# include +#elif defined(HAVE_PTHREAD_STUBS) +# include "cpython/pthread_stubs.h" +#else +# error "Require native threads. See https://bugs.python.org/issue31370" +#endif + +#if (defined(_POSIX_SEMAPHORES) && (_POSIX_SEMAPHORES+0) != -1 && \ + defined(HAVE_SEM_TIMEDWAIT)) +# define _Py_USE_SEMAPHORES +# include +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _PySemaphore { +#if defined(MS_WINDOWS) + HANDLE platform_sem; +#elif defined(_Py_USE_SEMAPHORES) + sem_t platform_sem; +#else + pthread_mutex_t mutex; + pthread_cond_t cond; + int counter; +#endif +} _PySemaphore; + +// Puts the current thread to sleep until _PySemaphore_Wakeup() is called. +// If `detach` is true, then the thread will detach/release the GIL while +// sleeping. +PyAPI_FUNC(int) +_PySemaphore_Wait(_PySemaphore *sema, PyTime_t timeout_ns, int detach); + +// Wakes up a single thread waiting on sema. Note that _PySemaphore_Wakeup() +// can be called before _PySemaphore_Wait(). 
+PyAPI_FUNC(void)
+_PySemaphore_Wakeup(_PySemaphore *sema);
+
+// Initializes/destroys a semaphore
+PyAPI_FUNC(void) _PySemaphore_Init(_PySemaphore *sema);
+PyAPI_FUNC(void) _PySemaphore_Destroy(_PySemaphore *sema);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_SEMAPHORE_H */
diff --git a/Include/internal/pycore_setobject.h b/Include/internal/pycore_setobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..0494c07fe1869d3e821247be921c28af0cf60f8c
--- /dev/null
+++ b/Include/internal/pycore_setobject.h
@@ -0,0 +1,39 @@
+#ifndef Py_INTERNAL_SETOBJECT_H
+#define Py_INTERNAL_SETOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+// Export for '_abc' shared extension
+PyAPI_FUNC(int) _PySet_NextEntry(
+    PyObject *set,
+    Py_ssize_t *pos,
+    PyObject **key,
+    Py_hash_t *hash);
+
+// Export for '_pickle' shared extension
+PyAPI_FUNC(int) _PySet_NextEntryRef(
+    PyObject *set,
+    Py_ssize_t *pos,
+    PyObject **key,
+    Py_hash_t *hash);
+
+// Export for '_pickle' shared extension
+PyAPI_FUNC(int) _PySet_Update(PyObject *set, PyObject *iterable);
+
+// Export for the gdb plugin's (python-gdb.py) benefit
+PyAPI_DATA(PyObject *) _PySet_Dummy;
+
+PyAPI_FUNC(int) _PySet_Contains(PySetObject *so, PyObject *key);
+
+// Clears the set without acquiring locks. Used by _PyCode_Fini.
+extern void _PySet_ClearInternal(PySetObject *so);
+
+#ifdef __cplusplus
+}
+#endif
+#endif // !Py_INTERNAL_SETOBJECT_H
diff --git a/Include/internal/pycore_signal.h b/Include/internal/pycore_signal.h
new file mode 100644
index 0000000000000000000000000000000000000000..47213a34ab77b526be8ef0672720811d519c8856
--- /dev/null
+++ b/Include/internal/pycore_signal.h
@@ -0,0 +1,108 @@
+// Define Py_NSIG constant for signal handling.
+
+#ifndef Py_INTERNAL_SIGNAL_H
+#define Py_INTERNAL_SIGNAL_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include <signal.h>               // NSIG
+
+
+// Restore signals that the interpreter has called SIG_IGN on to SIG_DFL.
+// Export for '_posixsubprocess' shared extension.
+PyAPI_FUNC(void) _Py_RestoreSignals(void);
+
+#ifdef _SIG_MAXSIG
+   // gh-91145: On FreeBSD, <signal.h> defines NSIG as 32: it doesn't include
+   // realtime signals: [SIGRTMIN,SIGRTMAX]. Use _SIG_MAXSIG instead. For
+   // example on x86-64 FreeBSD 13, SIGRTMAX is 126 and _SIG_MAXSIG is 128.
+# define Py_NSIG _SIG_MAXSIG
+#elif defined(NSIG)
+# define Py_NSIG NSIG
+#elif defined(_NSIG)
+# define Py_NSIG _NSIG            // BSD/SysV
+#elif defined(_SIGMAX)
+# define Py_NSIG (_SIGMAX + 1)    // QNX
+#elif defined(SIGMAX)
+# define Py_NSIG (SIGMAX + 1)     // djgpp
+#else
+# define Py_NSIG 64               // Use a reasonable default value
+#endif
+
+#define INVALID_FD (-1)
+
+struct _signals_runtime_state {
+    struct {
+        // tripped and func should be accessed using atomic ops.
+        int tripped;
+        PyObject* func;
+    } handlers[Py_NSIG];
+
+    volatile struct {
+#ifdef MS_WINDOWS
+        /* This would be "SOCKET fd" if <winsock2.h> were always included.
+           It isn't so we must cast to SOCKET where appropriate. */
+        volatile int fd;
+#elif defined(__VXWORKS__)
+        int fd;
+#else
+        sig_atomic_t fd;
+#endif
+
+        int warn_on_full_buffer;
+#ifdef MS_WINDOWS
+        int use_send;
+#endif
+    } wakeup;
+
+    /* Speed up sigcheck() when none tripped.
+       is_tripped should be accessed using atomic ops. */
+    int is_tripped;
+
+    /* These objects necessarily belong to the main interpreter. */
+    PyObject *default_handler;
+    PyObject *ignore_handler;
+
+#ifdef MS_WINDOWS
+    /* This would be "HANDLE sigint_event" if <windows.h> were always included.
+       It isn't so we must cast to HANDLE everywhere "sigint_event" is used. */
+    void *sigint_event;
+#endif
+
+    /* True if the main interpreter thread exited due to an unhandled
+     * KeyboardInterrupt exception, suggesting the user pressed ^C. */
+    int unhandled_keyboard_interrupt;
+};
+
+#ifdef MS_WINDOWS
+# define _signals_WAKEUP_INIT \
+    {.fd = INVALID_FD, .warn_on_full_buffer = 1, .use_send = 0}
+#else
+# define _signals_WAKEUP_INIT \
+    {.fd = INVALID_FD, .warn_on_full_buffer = 1}
+#endif
+
+#define _signals_RUNTIME_INIT \
+    { \
+        .wakeup = _signals_WAKEUP_INIT, \
+    }
+
+
+// Export for '_multiprocessing' shared extension
+PyAPI_FUNC(int) _PyOS_IsMainThread(void);
+
+#ifdef MS_WINDOWS
+// <windows.h> is not included by Python.h so use void* instead of HANDLE.
+// Export for '_multiprocessing' shared extension
+PyAPI_FUNC(void*) _PyOS_SigintEvent(void);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif // !Py_INTERNAL_SIGNAL_H
diff --git a/Include/internal/pycore_sliceobject.h b/Include/internal/pycore_sliceobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba8b1f1cb27dee3219436416586b2bb3e3b8d4e3
--- /dev/null
+++ b/Include/internal/pycore_sliceobject.h
@@ -0,0 +1,20 @@
+#ifndef Py_INTERNAL_SLICEOBJECT_H
+#define Py_INTERNAL_SLICEOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+/* runtime lifecycle */
+
+PyAPI_FUNC(PyObject *)
+_PyBuildSlice_ConsumeRefs(PyObject *start, PyObject *stop);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_SLICEOBJECT_H */
diff --git a/Include/internal/pycore_stackref.h b/Include/internal/pycore_stackref.h
new file mode 100644
index 0000000000000000000000000000000000000000..93898174789f7b3b937e6be6957c5ac3178758ca
--- /dev/null
+++ b/Include/internal/pycore_stackref.h
@@ -0,0 +1,195 @@
+#ifndef Py_INTERNAL_STACKREF_H
+#define Py_INTERNAL_STACKREF_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include <stdint.h>               // uintptr_t
+
+typedef union {
+    uintptr_t bits;
+} _PyStackRef;
+
+static const _PyStackRef Py_STACKREF_NULL = { .bits = 0 };
+
+#define Py_TAG_DEFERRED (1)
+
+// Gets a PyObject * from a _PyStackRef
+#if defined(Py_GIL_DISABLED)
+static inline PyObject *
+PyStackRef_Get(_PyStackRef tagged)
+{
+    PyObject *cleared = ((PyObject *)((tagged).bits & (~Py_TAG_DEFERRED)));
+    return cleared;
+}
+#else
+# define PyStackRef_Get(tagged) ((PyObject *)((tagged).bits))
+#endif
+
+// Converts a PyObject * to a PyStackRef, stealing the reference.
+#if defined(Py_GIL_DISABLED)
+static inline _PyStackRef
+_PyStackRef_StealRef(PyObject *obj)
+{
+    // Make sure we don't take an already tagged value.
+    assert(((uintptr_t)obj & Py_TAG_DEFERRED) == 0);
+    return ((_PyStackRef){.bits = ((uintptr_t)(obj))});
+}
+# define PyStackRef_StealRef(obj) _PyStackRef_StealRef(_PyObject_CAST(obj))
+#else
+# define PyStackRef_StealRef(obj) ((_PyStackRef){.bits = ((uintptr_t)(obj))})
+#endif
+
+// Converts a PyObject * to a PyStackRef, with a new reference
+#if defined(Py_GIL_DISABLED)
+static inline _PyStackRef
+_PyStackRef_NewRefDeferred(PyObject *obj)
+{
+    // Make sure we don't take an already tagged value.
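+    // (Editor's note on the branch below: if the object supports deferred
+    // reference counting, the pointer is returned with Py_TAG_DEFERRED set
+    // and no refcount change; otherwise a real strong reference is taken
+    // via Py_NewRef().)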
+ assert(((uintptr_t)obj & Py_TAG_DEFERRED) == 0); + assert(obj != NULL); + if (_PyObject_HasDeferredRefcount(obj)) { + return (_PyStackRef){ .bits = (uintptr_t)obj | Py_TAG_DEFERRED }; + } + else { + return (_PyStackRef){ .bits = (uintptr_t)Py_NewRef(obj) }; + } +} +# define PyStackRef_NewRefDeferred(obj) _PyStackRef_NewRefDeferred(_PyObject_CAST(obj)) +#else +# define PyStackRef_NewRefDeferred(obj) PyStackRef_NewRef(((_PyStackRef){.bits = ((uintptr_t)(obj))})) +#endif + +#if defined(Py_GIL_DISABLED) +static inline _PyStackRef +_PyStackRef_XNewRefDeferred(PyObject *obj) +{ + // Make sure we don't take an already tagged value. + assert(((uintptr_t)obj & Py_TAG_DEFERRED) == 0); + if (obj == NULL) { + return Py_STACKREF_NULL; + } + return _PyStackRef_NewRefDeferred(obj); +} +# define PyStackRef_XNewRefDeferred(obj) _PyStackRef_XNewRefDeferred(_PyObject_CAST(obj)) +#else +# define PyStackRef_XNewRefDeferred(obj) PyStackRef_XNewRef(((_PyStackRef){.bits = ((uintptr_t)(obj))})) +#endif + +// Converts a PyStackRef back to a PyObject *. +#if defined(Py_GIL_DISABLED) +static inline PyObject * +PyStackRef_StealObject(_PyStackRef tagged) +{ + if ((tagged.bits & Py_TAG_DEFERRED) == Py_TAG_DEFERRED) { + assert(_PyObject_HasDeferredRefcount(PyStackRef_Get(tagged))); + return Py_NewRef(PyStackRef_Get(tagged)); + } + return PyStackRef_Get(tagged); +} +#else +# define PyStackRef_StealObject(tagged) PyStackRef_Get(tagged) +#endif + +static inline void +_Py_untag_stack_borrowed(PyObject **dst, const _PyStackRef *src, size_t length) +{ + for (size_t i = 0; i < length; i++) { + dst[i] = PyStackRef_Get(src[i]); + } +} + +static inline void +_Py_untag_stack_steal(PyObject **dst, const _PyStackRef *src, size_t length) +{ + for (size_t i = 0; i < length; i++) { + dst[i] = PyStackRef_StealObject(src[i]); + } +} + + +#define PyStackRef_XSETREF(dst, src) \ + do { \ + _PyStackRef *_tmp_dst_ptr = &(dst); \ + _PyStackRef _tmp_old_dst = (*_tmp_dst_ptr); \ + *_tmp_dst_ptr = (src); \ + PyStackRef_XDECREF(_tmp_old_dst); \ + } while (0) + +#define PyStackRef_SETREF(dst, src) \ + do { \ + _PyStackRef *_tmp_dst_ptr = &(dst); \ + _PyStackRef _tmp_old_dst = (*_tmp_dst_ptr); \ + *_tmp_dst_ptr = (src); \ + PyStackRef_DECREF(_tmp_old_dst); \ + } while (0) + +#define PyStackRef_CLEAR(op) \ + do { \ + _PyStackRef *_tmp_op_ptr = &(op); \ + _PyStackRef _tmp_old_op = (*_tmp_op_ptr); \ + if (_tmp_old_op.bits != Py_STACKREF_NULL.bits) { \ + *_tmp_op_ptr = Py_STACKREF_NULL; \ + PyStackRef_DECREF(_tmp_old_op); \ + } \ + } while (0) + +#if defined(Py_GIL_DISABLED) +static inline void +PyStackRef_DECREF(_PyStackRef tagged) +{ + if ((tagged.bits & Py_TAG_DEFERRED) == Py_TAG_DEFERRED) { + return; + } + Py_DECREF(PyStackRef_Get(tagged)); +} +#else +# define PyStackRef_DECREF(op) Py_DECREF(PyStackRef_Get(op)) +#endif + +#if defined(Py_GIL_DISABLED) +static inline void +PyStackRef_INCREF(_PyStackRef tagged) +{ + if ((tagged.bits & Py_TAG_DEFERRED) == Py_TAG_DEFERRED) { + assert(_PyObject_HasDeferredRefcount(PyStackRef_Get(tagged))); + return; + } + Py_INCREF(PyStackRef_Get(tagged)); +} +#else +# define PyStackRef_INCREF(op) Py_INCREF(PyStackRef_Get(op)) +#endif + +static inline void +PyStackRef_XDECREF(_PyStackRef op) +{ + if (op.bits != Py_STACKREF_NULL.bits) { + PyStackRef_DECREF(op); + } +} + +static inline _PyStackRef +PyStackRef_NewRef(_PyStackRef obj) +{ + PyStackRef_INCREF(obj); + return obj; +} + +static inline _PyStackRef +PyStackRef_XNewRef(_PyStackRef obj) +{ + if (obj.bits == Py_STACKREF_NULL.bits) { + return obj; + } + return 
PyStackRef_NewRef(obj); +} + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_STACKREF_H */ diff --git a/Include/internal/pycore_strhex.h b/Include/internal/pycore_strhex.h new file mode 100644 index 0000000000000000000000000000000000000000..225f423912f2c27eabe64de90a3f9b916cb03577 --- /dev/null +++ b/Include/internal/pycore_strhex.h @@ -0,0 +1,39 @@ +#ifndef Py_INTERNAL_STRHEX_H +#define Py_INTERNAL_STRHEX_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +// Returns a str() containing the hex representation of argbuf. +// Export for '_hashlib' shared extension. +PyAPI_FUNC(PyObject*) _Py_strhex(const + char* argbuf, + const Py_ssize_t arglen); + +// Returns a bytes() containing the ASCII hex representation of argbuf. +extern PyObject* _Py_strhex_bytes( + const char* argbuf, + const Py_ssize_t arglen); + +// These variants include support for a separator between every N bytes: +extern PyObject* _Py_strhex_with_sep( + const char* argbuf, + const Py_ssize_t arglen, + PyObject* sep, + const int bytes_per_group); + +// Export for 'binascii' shared extension +PyAPI_FUNC(PyObject*) _Py_strhex_bytes_with_sep( + const char* argbuf, + const Py_ssize_t arglen, + PyObject* sep, + const int bytes_per_group); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_STRHEX_H */ diff --git a/Include/internal/pycore_structseq.h b/Include/internal/pycore_structseq.h new file mode 100644 index 0000000000000000000000000000000000000000..5cff165627502bd865abda793c469c4eeb01f588 --- /dev/null +++ b/Include/internal/pycore_structseq.h @@ -0,0 +1,40 @@ +#ifndef Py_INTERNAL_STRUCTSEQ_H +#define Py_INTERNAL_STRUCTSEQ_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + + +/* other API */ + +// Export for '_curses' shared extension +PyAPI_FUNC(PyTypeObject*) _PyStructSequence_NewType( + PyStructSequence_Desc *desc, + unsigned long tp_flags); + +extern int _PyStructSequence_InitBuiltinWithFlags( + PyInterpreterState *interp, + PyTypeObject *type, + PyStructSequence_Desc *desc, + unsigned long tp_flags); + +static inline int +_PyStructSequence_InitBuiltin(PyInterpreterState *interp, + PyTypeObject *type, + PyStructSequence_Desc *desc) +{ + return _PyStructSequence_InitBuiltinWithFlags(interp, type, desc, 0); +} + +extern void _PyStructSequence_FiniBuiltin( + PyInterpreterState *interp, + PyTypeObject *type); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_STRUCTSEQ_H */ diff --git a/Include/internal/pycore_symtable.h b/Include/internal/pycore_symtable.h new file mode 100644 index 0000000000000000000000000000000000000000..90252bf8365443774f7c40cc523662f50c7e2d39 --- /dev/null +++ b/Include/internal/pycore_symtable.h @@ -0,0 +1,204 @@ +#ifndef Py_INTERNAL_SYMTABLE_H +#define Py_INTERNAL_SYMTABLE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +struct _mod; // Type defined in pycore_ast.h + +typedef enum _block_type { + FunctionBlock, ClassBlock, ModuleBlock, + // Used for annotations if 'from __future__ import annotations' is active. + // Annotation blocks cannot bind names and are not evaluated. + AnnotationBlock, + + // The following blocks are used for generics and type aliases. These work + // mostly like functions (see PEP 695 for details). 
The three different + // blocks function identically; they are different enum entries only so + // that error messages can be more precise. + + // The block to enter when processing a "type" (PEP 695) construction, + // e.g., "type MyGeneric[T] = list[T]". + TypeAliasBlock, + // The block to enter when processing a "generic" (PEP 695) object, + // e.g., "def foo[T](): pass" or "class A[T]: pass". + TypeParametersBlock, + // The block to enter when processing the bound, the constraint tuple + // or the default value of a single "type variable" in the formal sense, + // i.e., a TypeVar, a TypeVarTuple or a ParamSpec object (the latter two + // do not support a bound or a constraint tuple). + TypeVariableBlock, +} _Py_block_ty; + +typedef enum _comprehension_type { + NoComprehension = 0, + ListComprehension = 1, + DictComprehension = 2, + SetComprehension = 3, + GeneratorExpression = 4 } _Py_comprehension_ty; + +/* source location information */ +typedef struct { + int lineno; + int end_lineno; + int col_offset; + int end_col_offset; +} _Py_SourceLocation; + +#define SRC_LOCATION_FROM_AST(n) \ + (_Py_SourceLocation){ \ + .lineno = (n)->lineno, \ + .end_lineno = (n)->end_lineno, \ + .col_offset = (n)->col_offset, \ + .end_col_offset = (n)->end_col_offset } + +static const _Py_SourceLocation NO_LOCATION = {-1, -1, -1, -1}; + +/* __future__ information */ +typedef struct { + int ff_features; /* flags set by future statements */ + _Py_SourceLocation ff_location; /* location of last future statement */ +} _PyFutureFeatures; + +struct _symtable_entry; + +struct symtable { + PyObject *st_filename; /* name of file being compiled, + decoded from the filesystem encoding */ + struct _symtable_entry *st_cur; /* current symbol table entry */ + struct _symtable_entry *st_top; /* symbol table entry for module */ + PyObject *st_blocks; /* dict: map AST node addresses + * to symbol table entries */ + PyObject *st_stack; /* list: stack of namespace info */ + PyObject *st_global; /* borrowed ref to st_top->ste_symbols */ + int st_nblocks; /* number of blocks used. kept for + consistency with the corresponding + compiler structure */ + PyObject *st_private; /* name of current class or NULL */ + _PyFutureFeatures *st_future; /* module's future features that affect + the symbol table */ + int recursion_depth; /* current recursion depth */ + int recursion_limit; /* recursion limit */ +}; + +typedef struct _symtable_entry { + PyObject_HEAD + PyObject *ste_id; /* int: key in ste_table->st_blocks */ + PyObject *ste_symbols; /* dict: variable names to flags */ + PyObject *ste_name; /* string: name of current block */ + PyObject *ste_varnames; /* list of function parameters */ + PyObject *ste_children; /* list of child blocks */ + PyObject *ste_directives;/* locations of global and nonlocal statements */ + PyObject *ste_mangled_names; /* set of names for which mangling should be applied */ + + _Py_block_ty ste_type; + // Optional string set by symtable.c and used when reporting errors. + // The content of that string is a description of the current "context". + // + // For instance, if we are processing the default value of the type + // variable "T" in "def foo[T = int](): pass", `ste_scope_info` is + // set to "a TypeVar default". 
+ const char *ste_scope_info; + + int ste_nested; /* true if block is nested */ + unsigned ste_free : 1; /* true if block has free variables */ + unsigned ste_child_free : 1; /* true if a child block has free vars, + including free refs to globals */ + unsigned ste_generator : 1; /* true if namespace is a generator */ + unsigned ste_coroutine : 1; /* true if namespace is a coroutine */ + _Py_comprehension_ty ste_comprehension; /* Kind of comprehension (if any) */ + unsigned ste_varargs : 1; /* true if block has varargs */ + unsigned ste_varkeywords : 1; /* true if block has varkeywords */ + unsigned ste_returns_value : 1; /* true if namespace uses return with + an argument */ + unsigned ste_needs_class_closure : 1; /* for class scopes, true if a + closure over __class__ + should be created */ + unsigned ste_needs_classdict : 1; /* for class scopes, true if a closure + over the class dict should be created */ + unsigned ste_comp_inlined : 1; /* true if this comprehension is inlined */ + unsigned ste_comp_iter_target : 1; /* true if visiting comprehension target */ + unsigned ste_can_see_class_scope : 1; /* true if this block can see names bound in an + enclosing class scope */ + int ste_comp_iter_expr; /* non-zero if visiting a comprehension range expression */ + int ste_lineno; /* first line of block */ + int ste_col_offset; /* offset of first line of block */ + int ste_end_lineno; /* end line of block */ + int ste_end_col_offset; /* end offset of first line of block */ + int ste_opt_lineno; /* lineno of last exec or import * */ + int ste_opt_col_offset; /* offset of last exec or import * */ + struct symtable *ste_table; +} PySTEntryObject; + +extern PyTypeObject PySTEntry_Type; + +#define PySTEntry_Check(op) Py_IS_TYPE((op), &PySTEntry_Type) + +extern long _PyST_GetSymbol(PySTEntryObject *, PyObject *); +extern int _PyST_GetScope(PySTEntryObject *, PyObject *); +extern int _PyST_IsFunctionLike(PySTEntryObject *); + +extern struct symtable* _PySymtable_Build( + struct _mod *mod, + PyObject *filename, + _PyFutureFeatures *future); +extern PySTEntryObject* _PySymtable_Lookup(struct symtable *, void *); + +extern void _PySymtable_Free(struct symtable *); + +extern PyObject *_Py_MaybeMangle(PyObject *privateobj, PySTEntryObject *ste, PyObject *name); +extern PyObject* _Py_Mangle(PyObject *p, PyObject *name); + +/* Flags for def-use information */ + +#define DEF_GLOBAL 1 /* global stmt */ +#define DEF_LOCAL 2 /* assignment in code block */ +#define DEF_PARAM (2<<1) /* formal parameter */ +#define DEF_NONLOCAL (2<<2) /* nonlocal stmt */ +#define USE (2<<3) /* name is used */ +#define DEF_FREE (2<<4) /* name used but not defined in nested block */ +#define DEF_FREE_CLASS (2<<5) /* free variable from class's method */ +#define DEF_IMPORT (2<<6) /* assignment occurred via import */ +#define DEF_ANNOT (2<<7) /* this name is annotated */ +#define DEF_COMP_ITER (2<<8) /* this name is a comprehension iteration variable */ +#define DEF_TYPE_PARAM (2<<9) /* this name is a type parameter */ +#define DEF_COMP_CELL (2<<10) /* this name is a cell in an inlined comprehension */ + +#define DEF_BOUND (DEF_LOCAL | DEF_PARAM | DEF_IMPORT) + +/* GLOBAL_EXPLICIT and GLOBAL_IMPLICIT are used internally by the symbol + table. GLOBAL is returned from PyST_GetScope() for either of them. + It is stored in ste_symbols at bits 13-16. 
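+
+   For example (editor's sketch), the scope of a name is recovered from its
+   flag word with ((flags >> SCOPE_OFFSET) & SCOPE_MASK), giving one of the
+   LOCAL / GLOBAL_EXPLICIT / GLOBAL_IMPLICIT / FREE / CELL values defined
+   below (compare _PyST_GetScope()).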
+*/ +#define SCOPE_OFFSET 12 +#define SCOPE_MASK (DEF_GLOBAL | DEF_LOCAL | DEF_PARAM | DEF_NONLOCAL) + +#define LOCAL 1 +#define GLOBAL_EXPLICIT 2 +#define GLOBAL_IMPLICIT 3 +#define FREE 4 +#define CELL 5 + +#define GENERATOR 1 +#define GENERATOR_EXPRESSION 2 + +// Used by symtablemodule.c +extern struct symtable* _Py_SymtableStringObjectFlags( + const char *str, + PyObject *filename, + int start, + PyCompilerFlags *flags); + +int _PyFuture_FromAST( + struct _mod * mod, + PyObject *filename, + _PyFutureFeatures* futures); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_SYMTABLE_H */ diff --git a/Include/internal/pycore_sysmodule.h b/Include/internal/pycore_sysmodule.h new file mode 100644 index 0000000000000000000000000000000000000000..6df574487bcd1b28b05913403dd9a05a172354ca --- /dev/null +++ b/Include/internal/pycore_sysmodule.h @@ -0,0 +1,38 @@ +#ifndef Py_INTERNAL_SYSMODULE_H +#define Py_INTERNAL_SYSMODULE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +PyAPI_FUNC(PyObject *) _PySys_GetAttr(PyThreadState *, PyObject *); /* unused */ +PyAPI_FUNC(int) _PySys_GetOptionalAttr(PyObject *, PyObject **); +PyAPI_FUNC(int) _PySys_GetOptionalAttrString(const char *, PyObject **); +PyAPI_FUNC(PyObject *) _PySys_GetRequiredAttr(PyObject *); +PyAPI_FUNC(PyObject *) _PySys_GetRequiredAttrString(const char *); + +// Export for '_pickle' shared extension +PyAPI_FUNC(size_t) _PySys_GetSizeOf(PyObject *); + +extern int _PySys_Audit( + PyThreadState *tstate, + const char *event, + const char *argFormat, + ...); + +// _PySys_ClearAuditHooks() must not be exported: use extern rather than +// PyAPI_FUNC(). We want minimal exposure of this function. +extern void _PySys_ClearAuditHooks(PyThreadState *tstate); + +extern int _PySys_SetAttr(PyObject *, PyObject *); + +extern int _PySys_ClearAttrString(PyInterpreterState *interp, + const char *name, int verbose); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_SYSMODULE_H */ diff --git a/Include/internal/pycore_time.h b/Include/internal/pycore_time.h new file mode 100644 index 0000000000000000000000000000000000000000..205ac5d3781ddd6c6b70ae4f86c8901a1b064737 --- /dev/null +++ b/Include/internal/pycore_time.h @@ -0,0 +1,337 @@ +// Internal PyTime_t C API: see Doc/c-api/time.rst for the documentation. +// +// The PyTime_t type is an integer to support directly common arithmetic +// operations such as t1 + t2. +// +// Time formats: +// +// * Seconds. +// * Seconds as a floating-point number (C double). +// * Milliseconds (10^-3 seconds). +// * Microseconds (10^-6 seconds). +// * 100 nanoseconds (10^-7 seconds), used on Windows. +// * Nanoseconds (10^-9 seconds). +// * timeval structure, 1 microsecond (10^-6 seconds). +// * timespec structure, 1 nanosecond (10^-9 seconds). +// +// Note that PyTime_t is now specified as int64_t, in nanoseconds. +// (If we need to change this, we'll need new public API with new names.) +// Previously, PyTime_t was configurable (in theory); some comments and code +// might still allude to that. +// +// Integer overflows are detected and raise OverflowError. Conversion to a +// resolution larger than 1 nanosecond is rounded correctly with the requested +// rounding mode. Available rounding modes: +// +// * Round towards minus infinity (-inf). For example, used to read a clock. +// * Round towards infinity (+inf). For example, used for timeout to wait "at +// least" N seconds. 
+// * Round to nearest with ties going to nearest even integer. For example, used +// to round from a Python float. +// * Round away from zero. For example, used for timeout. +// +// Some functions clamp the result in the range [PyTime_MIN; PyTime_MAX]. The +// caller doesn't have to handle errors and so doesn't need to hold the GIL to +// handle exceptions. For example, _PyTime_Add(t1, t2) computes t1+t2 and +// clamps the result on overflow. +// +// Clocks: +// +// * System clock +// * Monotonic clock +// * Performance counter +// +// Internally, operations like (t * k / q) with integers are implemented in a +// way to reduce the risk of integer overflow. Such operation is used to convert a +// clock value expressed in ticks with a frequency to PyTime_t, like +// QueryPerformanceCounter() with QueryPerformanceFrequency() on Windows. + + +#ifndef Py_INTERNAL_TIME_H +#define Py_INTERNAL_TIME_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + + +#ifdef __clang__ +struct timeval; +#endif + +#define _SIZEOF_PYTIME_T 8 + +typedef enum { + // Round towards minus infinity (-inf). + // For example, used to read a clock. + _PyTime_ROUND_FLOOR=0, + + // Round towards infinity (+inf). + // For example, used for timeout to wait "at least" N seconds. + _PyTime_ROUND_CEILING=1, + + // Round to nearest with ties going to nearest even integer. + // For example, used to round from a Python float. + _PyTime_ROUND_HALF_EVEN=2, + + // Round away from zero + // For example, used for timeout. _PyTime_ROUND_CEILING rounds + // -1e-9 to 0 milliseconds which causes bpo-31786 issue. + // _PyTime_ROUND_UP rounds -1e-9 to -1 millisecond which keeps + // the timeout sign as expected. select.poll(timeout) must block + // for negative values. + _PyTime_ROUND_UP=3, + + // _PyTime_ROUND_TIMEOUT (an alias for _PyTime_ROUND_UP) should be + // used for timeouts. + _PyTime_ROUND_TIMEOUT = _PyTime_ROUND_UP +} _PyTime_round_t; + + +// Convert a time_t to a PyLong. +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(PyObject*) _PyLong_FromTime_t(time_t sec); + +// Convert a PyLong to a time_t. +// Export for '_datetime' shared extension +PyAPI_FUNC(time_t) _PyLong_AsTime_t(PyObject *obj); + +// Convert a number of seconds, int or float, to time_t. +// Export for '_datetime' shared extension. +PyAPI_FUNC(int) _PyTime_ObjectToTime_t( + PyObject *obj, + time_t *sec, + _PyTime_round_t); + +// Convert a number of seconds, int or float, to a timeval structure. +// usec is in the range [0; 999999] and rounded towards zero. +// For example, -1.2 is converted to (-2, 800000). +// Export for '_datetime' shared extension. +PyAPI_FUNC(int) _PyTime_ObjectToTimeval( + PyObject *obj, + time_t *sec, + long *usec, + _PyTime_round_t); + +// Convert a number of seconds, int or float, to a timespec structure. +// nsec is in the range [0; 999999999] and rounded towards zero. +// For example, -1.2 is converted to (-2, 800000000). +// Export for '_testinternalcapi' shared extension. +PyAPI_FUNC(int) _PyTime_ObjectToTimespec( + PyObject *obj, + time_t *sec, + long *nsec, + _PyTime_round_t); + + +// Create a timestamp from a number of seconds. +// Export for '_socket' shared extension. +PyAPI_FUNC(PyTime_t) _PyTime_FromSeconds(int seconds); + +// Create a timestamp from a number of seconds in double. 
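+// (Editor's example: with round=_PyTime_ROUND_HALF_EVEN, 1.5 seconds becomes
+// a PyTime_t of 1500000000, since PyTime_t is specified in nanoseconds as
+// noted at the top of this file.)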
+extern int _PyTime_FromSecondsDouble( + double seconds, + _PyTime_round_t round, + PyTime_t *result); + +// Macro to create a timestamp from a number of seconds, no integer overflow. +// Only use the macro for small values, prefer _PyTime_FromSeconds(). +#define _PYTIME_FROMSECONDS(seconds) \ + ((PyTime_t)(seconds) * (1000 * 1000 * 1000)) + +// Create a timestamp from a number of microseconds. +// Clamp to [PyTime_MIN; PyTime_MAX] on overflow. +extern PyTime_t _PyTime_FromMicrosecondsClamp(PyTime_t us); + +// Create a timestamp from a Python int object (number of nanoseconds). +// Export for '_lsprof' shared extension. +PyAPI_FUNC(int) _PyTime_FromLong(PyTime_t *t, + PyObject *obj); + +// Convert a number of seconds (Python float or int) to a timestamp. +// Raise an exception and return -1 on error, return 0 on success. +// Export for '_socket' shared extension. +PyAPI_FUNC(int) _PyTime_FromSecondsObject(PyTime_t *t, + PyObject *obj, + _PyTime_round_t round); + +// Convert a number of milliseconds (Python float or int, 10^-3) to a timestamp. +// Raise an exception and return -1 on error, return 0 on success. +// Export for 'select' shared extension. +PyAPI_FUNC(int) _PyTime_FromMillisecondsObject(PyTime_t *t, + PyObject *obj, + _PyTime_round_t round); + +// Convert timestamp to a number of milliseconds (10^-3 seconds). +// Export for '_ssl' shared extension. +PyAPI_FUNC(PyTime_t) _PyTime_AsMilliseconds(PyTime_t t, + _PyTime_round_t round); + +// Convert timestamp to a number of microseconds (10^-6 seconds). +// Export for '_queue' shared extension. +PyAPI_FUNC(PyTime_t) _PyTime_AsMicroseconds(PyTime_t t, + _PyTime_round_t round); + +#ifdef MS_WINDOWS +// Convert timestamp to a number of 100 nanoseconds (10^-7 seconds). +extern PyTime_t _PyTime_As100Nanoseconds(PyTime_t t, + _PyTime_round_t round); +#endif + +// Convert a timestamp (number of nanoseconds) as a Python int object. +// Export for '_testinternalcapi' shared extension. +PyAPI_FUNC(PyObject*) _PyTime_AsLong(PyTime_t t); + +#ifndef MS_WINDOWS +// Create a timestamp from a timeval structure. +// Raise an exception and return -1 on overflow, return 0 on success. +extern int _PyTime_FromTimeval(PyTime_t *tp, struct timeval *tv); +#endif + +// Convert a timestamp to a timeval structure (microsecond resolution). +// tv_usec is always positive. +// Raise an exception and return -1 if the conversion overflowed, +// return 0 on success. +// Export for 'select' shared extension. +PyAPI_FUNC(int) _PyTime_AsTimeval(PyTime_t t, + struct timeval *tv, + _PyTime_round_t round); + +// Similar to _PyTime_AsTimeval() but don't raise an exception on overflow. +// On overflow, clamp tv_sec to PyTime_t min/max. +// Export for 'select' shared extension. +PyAPI_FUNC(void) _PyTime_AsTimeval_clamp(PyTime_t t, + struct timeval *tv, + _PyTime_round_t round); + +// Convert a timestamp to a number of seconds (secs) and microseconds (us). +// us is always positive. This function is similar to _PyTime_AsTimeval() +// except that secs is always a time_t type, whereas the timeval structure +// uses a C long for tv_sec on Windows. +// Raise an exception and return -1 if the conversion overflowed, +// return 0 on success. +// Export for '_datetime' shared extension. +PyAPI_FUNC(int) _PyTime_AsTimevalTime_t( + PyTime_t t, + time_t *secs, + int *us, + _PyTime_round_t round); + +#if defined(HAVE_CLOCK_GETTIME) || defined(HAVE_KQUEUE) +// Create a timestamp from a timespec structure. +// Raise an exception and return -1 on overflow, return 0 on success. 
+extern int _PyTime_FromTimespec(PyTime_t *tp, const struct timespec *ts); + +// Convert a timestamp to a timespec structure (nanosecond resolution). +// tv_nsec is always positive. +// Raise an exception and return -1 on error, return 0 on success. +// Export for '_testinternalcapi' shared extension. +PyAPI_FUNC(int) _PyTime_AsTimespec(PyTime_t t, struct timespec *ts); + +// Similar to _PyTime_AsTimespec() but don't raise an exception on overflow. +// On overflow, clamp tv_sec to PyTime_t min/max. +// Export for '_testinternalcapi' shared extension. +PyAPI_FUNC(void) _PyTime_AsTimespec_clamp(PyTime_t t, struct timespec *ts); +#endif + + +// Compute t1 + t2. Clamp to [PyTime_MIN; PyTime_MAX] on overflow. +extern PyTime_t _PyTime_Add(PyTime_t t1, PyTime_t t2); + +// Structure used by time.get_clock_info() +typedef struct { + const char *implementation; + int monotonic; + int adjustable; + double resolution; +} _Py_clock_info_t; + +// Get the current time from the system clock. +// On success, set *t and *info (if not NULL), and return 0. +// On error, raise an exception and return -1. +extern int _PyTime_TimeWithInfo( + PyTime_t *t, + _Py_clock_info_t *info); + +// Get the time of a monotonic clock, i.e. a clock that cannot go backwards. +// The clock is not affected by system clock updates. The reference point of +// the returned value is undefined, so that only the difference between the +// results of consecutive calls is valid. +// +// Fill info (if set) with information of the function used to get the time. +// +// Return 0 on success, raise an exception and return -1 on error. +// Export for '_testsinglephase' shared extension. +PyAPI_FUNC(int) _PyTime_MonotonicWithInfo( + PyTime_t *t, + _Py_clock_info_t *info); + + +// Converts a timestamp to the Gregorian time, using the local time zone. +// Return 0 on success, raise an exception and return -1 on error. +// Export for '_datetime' shared extension. +PyAPI_FUNC(int) _PyTime_localtime(time_t t, struct tm *tm); + +// Converts a timestamp to the Gregorian time, assuming UTC. +// Return 0 on success, raise an exception and return -1 on error. +// Export for '_datetime' shared extension. +PyAPI_FUNC(int) _PyTime_gmtime(time_t t, struct tm *tm); + + +// Get the performance counter: clock with the highest available resolution to +// measure a short duration. +// +// Fill info (if set) with information of the function used to get the time. +// +// Return 0 on success, raise an exception and return -1 on error. +extern int _PyTime_PerfCounterWithInfo( + PyTime_t *t, + _Py_clock_info_t *info); + + +// --- _PyDeadline ----------------------------------------------------------- + +// Create a deadline. +// Pseudo code: return PyTime_MonotonicRaw() + timeout +// Export for '_ssl' shared extension. +PyAPI_FUNC(PyTime_t) _PyDeadline_Init(PyTime_t timeout); + +// Get remaining time from a deadline. +// Pseudo code: return deadline - PyTime_MonotonicRaw() +// Export for '_ssl' shared extension. +PyAPI_FUNC(PyTime_t) _PyDeadline_Get(PyTime_t deadline); + + +// --- _PyTimeFraction ------------------------------------------------------- + +typedef struct { + PyTime_t numer; + PyTime_t denom; +} _PyTimeFraction; + +// Set a fraction. +// Return 0 on success. +// Return -1 if the fraction is invalid. +extern int _PyTimeFraction_Set( + _PyTimeFraction *frac, + PyTime_t numer, + PyTime_t denom); + +// Compute ticks * frac.numer / frac.denom. +// Clamp to [PyTime_MIN; PyTime_MAX] on overflow. 
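+// Editor's sketch (illustrative, not part of this header): converting
+// performance-counter ticks at `freq` ticks per second into nanoseconds:
+//
+//     _PyTimeFraction frac;
+//     if (_PyTimeFraction_Set(&frac, 1000 * 1000 * 1000, freq) == 0) {
+//         PyTime_t ns = _PyTimeFraction_Mul(ticks, &frac);
+//     }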
+extern PyTime_t _PyTimeFraction_Mul( + PyTime_t ticks, + const _PyTimeFraction *frac); + +// Compute a clock resolution: frac.numer / frac.denom / 1e9. +extern double _PyTimeFraction_Resolution( + const _PyTimeFraction *frac); + + +#ifdef __cplusplus +} +#endif +#endif // !Py_INTERNAL_TIME_H diff --git a/Include/internal/pycore_token.h b/Include/internal/pycore_token.h new file mode 100644 index 0000000000000000000000000000000000000000..571cd6249f28126a573667ea41b0d52138aa9cff --- /dev/null +++ b/Include/internal/pycore_token.h @@ -0,0 +1,106 @@ +// Auto-generated by Tools/build/generate_token.py + +/* Token types */ +#ifndef Py_INTERNAL_TOKEN_H +#define Py_INTERNAL_TOKEN_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#undef TILDE /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */ + +#define ENDMARKER 0 +#define NAME 1 +#define NUMBER 2 +#define STRING 3 +#define NEWLINE 4 +#define INDENT 5 +#define DEDENT 6 +#define LPAR 7 +#define RPAR 8 +#define LSQB 9 +#define RSQB 10 +#define COLON 11 +#define COMMA 12 +#define SEMI 13 +#define PLUS 14 +#define MINUS 15 +#define STAR 16 +#define SLASH 17 +#define VBAR 18 +#define AMPER 19 +#define LESS 20 +#define GREATER 21 +#define EQUAL 22 +#define DOT 23 +#define PERCENT 24 +#define LBRACE 25 +#define RBRACE 26 +#define EQEQUAL 27 +#define NOTEQUAL 28 +#define LESSEQUAL 29 +#define GREATEREQUAL 30 +#define TILDE 31 +#define CIRCUMFLEX 32 +#define LEFTSHIFT 33 +#define RIGHTSHIFT 34 +#define DOUBLESTAR 35 +#define PLUSEQUAL 36 +#define MINEQUAL 37 +#define STAREQUAL 38 +#define SLASHEQUAL 39 +#define PERCENTEQUAL 40 +#define AMPEREQUAL 41 +#define VBAREQUAL 42 +#define CIRCUMFLEXEQUAL 43 +#define LEFTSHIFTEQUAL 44 +#define RIGHTSHIFTEQUAL 45 +#define DOUBLESTAREQUAL 46 +#define DOUBLESLASH 47 +#define DOUBLESLASHEQUAL 48 +#define AT 49 +#define ATEQUAL 50 +#define RARROW 51 +#define ELLIPSIS 52 +#define COLONEQUAL 53 +#define EXCLAMATION 54 +#define OP 55 +#define TYPE_IGNORE 56 +#define TYPE_COMMENT 57 +#define SOFT_KEYWORD 58 +#define FSTRING_START 59 +#define FSTRING_MIDDLE 60 +#define FSTRING_END 61 +#define COMMENT 62 +#define NL 63 +#define ERRORTOKEN 64 +#define N_TOKENS 66 +#define NT_OFFSET 256 + +/* Special definitions for cooperation with parser */ + +#define ISTERMINAL(x) ((x) < NT_OFFSET) +#define ISNONTERMINAL(x) ((x) >= NT_OFFSET) +#define ISEOF(x) ((x) == ENDMARKER) +#define ISWHITESPACE(x) ((x) == ENDMARKER || \ + (x) == NEWLINE || \ + (x) == INDENT || \ + (x) == DEDENT) +#define ISSTRINGLIT(x) ((x) == STRING || \ + (x) == FSTRING_MIDDLE) + + +// Export these 4 symbols for 'test_peg_generator' +PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */ +PyAPI_FUNC(int) _PyToken_OneChar(int); +PyAPI_FUNC(int) _PyToken_TwoChars(int, int); +PyAPI_FUNC(int) _PyToken_ThreeChars(int, int, int); + +#ifdef __cplusplus +} +#endif +#endif // !Py_INTERNAL_TOKEN_H diff --git a/Include/internal/pycore_traceback.h b/Include/internal/pycore_traceback.h new file mode 100644 index 0000000000000000000000000000000000000000..10922bff98bd4bc0060189833d05b59ac4dc649f --- /dev/null +++ b/Include/internal/pycore_traceback.h @@ -0,0 +1,106 @@ +#ifndef Py_INTERNAL_TRACEBACK_H +#define Py_INTERNAL_TRACEBACK_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +// Export for '_ctypes' shared extension +PyAPI_FUNC(int) _Py_DisplaySourceLine(PyObject *, PyObject 
*, int, int, int *, PyObject **);
+
+// Export for 'pyexpat' shared extension
+PyAPI_FUNC(void) _PyTraceback_Add(const char *, const char *, int);
+
+/* Write the Python traceback into the file 'fd'. For example:
+
+       Traceback (most recent call first):
+         File "xxx", line xxx in <xxx>
+         File "xxx", line xxx in <xxx>
+         ...
+         File "xxx", line xxx in <xxx>
+
+   This function is written for debugging purposes only, to dump the traceback
+   in the worst case: after a segmentation fault, at fatal error, etc. That's
+   why it is very limited. Strings are truncated to 100 characters and encoded
+   to ASCII with backslashreplace. It doesn't write the source code, only the
+   function name, filename and line number of each frame. Write only the first
+   100 frames: if the traceback is truncated, write the line " ...".
+
+   This function is signal safe. */
+
+extern void _Py_DumpTraceback(
+    int fd,
+    PyThreadState *tstate);
+
+/* Write the traceback of all threads into the file 'fd'. current_tstate can
+   be NULL.
+
+   Return NULL on success, or an error message on error.
+
+   This function is written for debugging purposes only. It calls
+   _Py_DumpTraceback() for each thread, and so has the same limitations. It
+   only writes the traceback of the first 100 threads: write "..." if there
+   are more threads.
+
+   If current_tstate is NULL, the function tries to get the Python thread state
+   of the current thread. It is not an error if the function is unable to get
+   the current Python thread state.
+
+   If interp is NULL, the function tries to get the interpreter state from
+   the current Python thread state, or from
+   _PyGILState_GetInterpreterStateUnsafe() as a last resort.
+
+   It is better to pass NULL for interp and current_tstate: the function tries
+   different options to retrieve this information.
+
+   This function is signal safe. */
+
+extern const char* _Py_DumpTracebackThreads(
+    int fd,
+    PyInterpreterState *interp,
+    PyThreadState *current_tstate);
+
+/* Write a Unicode object into the file descriptor fd. Encode the string to
+   ASCII using the backslashreplace error handler.
+
+   Do nothing if text is not a Unicode object. The function accepts a Unicode
+   string which is not ready (PyUnicode_WCHAR_KIND).
+
+   This function is signal safe. */
+extern void _Py_DumpASCII(int fd, PyObject *text);
+
+/* Format an integer as decimal into the file descriptor fd.
+
+   This function is signal safe. */
+extern void _Py_DumpDecimal(
+    int fd,
+    size_t value);
+
+/* Format an integer as hexadecimal with width digits into the file descriptor
+   fd. The function is signal safe. */
+extern void _Py_DumpHexadecimal(
+    int fd,
+    uintptr_t value,
+    Py_ssize_t width);
+
+extern PyObject* _PyTraceBack_FromFrame(
+    PyObject *tb_next,
+    PyFrameObject *frame);
+
+#define EXCEPTION_TB_HEADER "Traceback (most recent call last):\n"
+#define EXCEPTION_GROUP_TB_HEADER "Exception Group Traceback (most recent call last):\n"
+
+/* Write the traceback tb to file f. Prefix each line with
+   indent spaces followed by the margin (if it is not NULL).
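+
+   A minimal call sketch (editor's illustration, assuming a valid traceback
+   object `tb` and that pycore_sysmodule.h is included for the stderr lookup):
+
+       PyObject *f = _PySys_GetRequiredAttrString("stderr");
+       if (f == NULL) {
+           return -1;
+       }
+       int res = _PyTraceBack_Print(tb, EXCEPTION_TB_HEADER, f);
+       Py_DECREF(f);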
*/ +extern int _PyTraceBack_Print( + PyObject *tb, const char *header, PyObject *f); +extern int _Py_WriteIndentedMargin(int, const char*, PyObject *); +extern int _Py_WriteIndent(int, PyObject *); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_TRACEBACK_H */ diff --git a/Include/internal/pycore_tracemalloc.h b/Include/internal/pycore_tracemalloc.h new file mode 100644 index 0000000000000000000000000000000000000000..f70d47074f813c630c4017788a69299d744fbba4 --- /dev/null +++ b/Include/internal/pycore_tracemalloc.h @@ -0,0 +1,170 @@ +#ifndef Py_INTERNAL_TRACEMALLOC_H +#define Py_INTERNAL_TRACEMALLOC_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_hashtable.h" // _Py_hashtable_t + + +/* Trace memory blocks allocated by PyMem_RawMalloc() */ +#define TRACE_RAW_MALLOC + + +struct _PyTraceMalloc_Config { + /* Module initialized? + Variable protected by the GIL */ + enum { + TRACEMALLOC_NOT_INITIALIZED, + TRACEMALLOC_INITIALIZED, + TRACEMALLOC_FINALIZED + } initialized; + + /* Is tracemalloc tracing memory allocations? + Variable protected by the GIL */ + int tracing; + + /* limit of the number of frames in a traceback, 1 by default. + Variable protected by the GIL. */ + int max_nframe; +}; + + +/* Pack the frame_t structure to reduce the memory footprint on 64-bit + architectures: 12 bytes instead of 16. */ +#if defined(_MSC_VER) +#pragma pack(push, 4) +#endif + +struct +#ifdef __GNUC__ +__attribute__((packed)) +#endif +tracemalloc_frame { + /* filename cannot be NULL: "" is used if the Python frame + filename is NULL */ + PyObject *filename; + unsigned int lineno; +}; +#ifdef _MSC_VER +#pragma pack(pop) +#endif + +struct tracemalloc_traceback { + Py_uhash_t hash; + /* Number of frames stored */ + uint16_t nframe; + /* Total number of frames the traceback had */ + uint16_t total_nframe; + struct tracemalloc_frame frames[1]; +}; + + +struct _tracemalloc_runtime_state { + struct _PyTraceMalloc_Config config; + + /* Protected by the GIL */ + struct { + PyMemAllocatorEx mem; + PyMemAllocatorEx raw; + PyMemAllocatorEx obj; + } allocators; + +#if defined(TRACE_RAW_MALLOC) + PyThread_type_lock tables_lock; +#endif + /* Size in bytes of currently traced memory. + Protected by TABLES_LOCK(). */ + size_t traced_memory; + /* Peak size in bytes of traced memory. + Protected by TABLES_LOCK(). */ + size_t peak_traced_memory; + /* Hash table used as a set to intern filenames: + PyObject* => PyObject*. + Protected by the GIL */ + _Py_hashtable_t *filenames; + /* Buffer to store a new traceback in traceback_new(). + Protected by the GIL. */ + struct tracemalloc_traceback *traceback; + /* Hash table used as a set to intern tracebacks: + traceback_t* => traceback_t* + Protected by the GIL */ + _Py_hashtable_t *tracebacks; + /* pointer (void*) => trace (trace_t*). + Protected by TABLES_LOCK(). */ + _Py_hashtable_t *traces; + /* domain (unsigned int) => traces (_Py_hashtable_t). + Protected by TABLES_LOCK(). */ + _Py_hashtable_t *domains; + + struct tracemalloc_traceback empty_traceback; + + Py_tss_t reentrant_key; +}; + +#define _tracemalloc_runtime_state_INIT \ + { \ + .config = { \ + .initialized = TRACEMALLOC_NOT_INITIALIZED, \ + .tracing = 0, \ + .max_nframe = 1, \ + }, \ + .reentrant_key = Py_tss_NEEDS_INIT, \ + } + + +// Get the traceback where a memory block was allocated. +// +// Return a tuple of (filename: str, lineno: int) tuples. 
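+// For example (editor's illustration, not from the header):
+// (("spam.py", 12), ("eggs.py", 7)) for a two-frame traceback.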
+//
+// Return None if the tracemalloc module is disabled or if the memory block
+// is not tracked by tracemalloc.
+//
+// Raise an exception and return NULL on error.
+//
+// Export for '_testinternalcapi' shared extension.
+PyAPI_FUNC(PyObject*) _PyTraceMalloc_GetTraceback(
+    unsigned int domain,
+    uintptr_t ptr);
+
+/* Return non-zero if tracemalloc is tracing */
+extern int _PyTraceMalloc_IsTracing(void);
+
+/* Clear the tracemalloc traces */
+extern void _PyTraceMalloc_ClearTraces(void);
+
+/* Get the tracemalloc traces */
+extern PyObject* _PyTraceMalloc_GetTraces(void);
+
+/* Get the tracemalloc traceback for an object */
+extern PyObject* _PyTraceMalloc_GetObjectTraceback(PyObject *obj);
+
+/* Initialize tracemalloc */
+extern PyStatus _PyTraceMalloc_Init(void);
+
+/* Start tracemalloc */
+extern int _PyTraceMalloc_Start(int max_nframe);
+
+/* Stop tracemalloc */
+extern void _PyTraceMalloc_Stop(void);
+
+/* Get the tracemalloc traceback limit */
+extern int _PyTraceMalloc_GetTracebackLimit(void);
+
+/* Get the memory usage of tracemalloc in bytes */
+extern size_t _PyTraceMalloc_GetMemory(void);
+
+/* Get the current size and peak size of traced memory blocks as a 2-tuple */
+extern PyObject* _PyTraceMalloc_GetTracedMemory(void);
+
+/* Set the peak size of traced memory blocks to the current size */
+extern void _PyTraceMalloc_ResetPeak(void);
+
+#ifdef __cplusplus
+}
+#endif
+#endif // !Py_INTERNAL_TRACEMALLOC_H
diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ed5b1d826aaa4a4b047017ffc3fc26300f9d12e
--- /dev/null
+++ b/Include/internal/pycore_tstate.h
@@ -0,0 +1,46 @@
+#ifndef Py_INTERNAL_TSTATE_H
+#define Py_INTERNAL_TSTATE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_brc.h"       // struct _brc_thread_state
+#include "pycore_freelist.h"  // struct _Py_freelist_state
+#include "pycore_mimalloc.h"  // struct _mimalloc_thread_state
+#include "pycore_qsbr.h"      // struct qsbr
+
+
+// Every PyThreadState is actually allocated as a _PyThreadStateImpl. The
+// PyThreadState fields are exposed as part of the C API, although most fields
+// are intended to be private. The _PyThreadStateImpl fields are not exposed.
+typedef struct _PyThreadStateImpl {
+    // semi-public fields are in PyThreadState.
+ PyThreadState base; + + PyObject *asyncio_running_loop; // Strong reference + + struct _qsbr_thread_state *qsbr; // only used by free-threaded build + struct llist_node mem_free_queue; // delayed free queue + +#ifdef Py_GIL_DISABLED + struct _gc_thread_state gc; + struct _mimalloc_thread_state mimalloc; + struct _Py_object_freelists freelists; + struct _brc_thread_state brc; +#endif + +#if defined(Py_REF_DEBUG) && defined(Py_GIL_DISABLED) + Py_ssize_t reftotal; // this thread's total refcount operations +#endif + +} _PyThreadStateImpl; + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_TSTATE_H */ diff --git a/Include/internal/pycore_tuple.h b/Include/internal/pycore_tuple.h new file mode 100644 index 0000000000000000000000000000000000000000..14a9e42c3a324cd73bd0527fe5ba805f7a91d7b0 --- /dev/null +++ b/Include/internal/pycore_tuple.h @@ -0,0 +1,35 @@ +#ifndef Py_INTERNAL_TUPLE_H +#define Py_INTERNAL_TUPLE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +extern void _PyTuple_MaybeUntrack(PyObject *); +extern void _PyTuple_DebugMallocStats(FILE *out); + +/* runtime lifecycle */ + +extern PyStatus _PyTuple_InitGlobalObjects(PyInterpreterState *); + + +/* other API */ + +#define _PyTuple_ITEMS(op) _Py_RVALUE(_PyTuple_CAST(op)->ob_item) + +extern PyObject *_PyTuple_FromArray(PyObject *const *, Py_ssize_t); +PyAPI_FUNC(PyObject *)_PyTuple_FromArraySteal(PyObject *const *, Py_ssize_t); + +typedef struct { + PyObject_HEAD + Py_ssize_t it_index; + PyTupleObject *it_seq; /* Set to NULL when iterator is exhausted */ +} _PyTupleIterObject; + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_TUPLE_H */ diff --git a/Include/internal/pycore_typeobject.h b/Include/internal/pycore_typeobject.h new file mode 100644 index 0000000000000000000000000000000000000000..164b243dae7806e2678536181f82d3a03f1df533 --- /dev/null +++ b/Include/internal/pycore_typeobject.h @@ -0,0 +1,245 @@ +#ifndef Py_INTERNAL_TYPEOBJECT_H +#define Py_INTERNAL_TYPEOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_moduleobject.h" // PyModuleObject +#include "pycore_lock.h" // PyMutex + + +/* state */ + +#define _Py_TYPE_BASE_VERSION_TAG (2<<16) +#define _Py_MAX_GLOBAL_TYPE_VERSION_TAG (_Py_TYPE_BASE_VERSION_TAG - 1) + +/* For now we hard-code this to a value for which we are confident + all the static builtin types will fit (for all builds). */ +#define _Py_MAX_MANAGED_STATIC_BUILTIN_TYPES 200 +#define _Py_MAX_MANAGED_STATIC_EXT_TYPES 10 +#define _Py_MAX_MANAGED_STATIC_TYPES \ + (_Py_MAX_MANAGED_STATIC_BUILTIN_TYPES + _Py_MAX_MANAGED_STATIC_EXT_TYPES) + +struct _types_runtime_state { + /* Used to set PyTypeObject.tp_version_tag for core static types. */ + // bpo-42745: next_version_tag remains shared by all interpreters + // because of static types. + unsigned int next_version_tag; + + struct { + struct { + PyTypeObject *type; + int64_t interp_count; + } types[_Py_MAX_MANAGED_STATIC_TYPES]; + } managed_static; +}; + + +// Type attribute lookup cache: speed up attribute and method lookups, +// see _PyType_Lookup(). 
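+// A lookup derives one slot from the type's version tag and the name's hash;
+// a minimal sketch of the idea (editor's illustration, not the exact hash
+// CPython uses):
+//
+//     size_t index = (version ^ (size_t)PyObject_Hash(name))
+//                    & ((1 << MCACHE_SIZE_EXP) - 1);
+//     struct type_cache_entry *entry = &cache->hashtable[index];
+//     if (entry->version == version && entry->name == name) {
+//         return entry->value;   // cache hit
+//     }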
+struct type_cache_entry { + unsigned int version; // initialized from type->tp_version_tag +#ifdef Py_GIL_DISABLED + _PySeqLock sequence; +#endif + PyObject *name; // reference to exactly a str or None + PyObject *value; // borrowed reference or NULL +}; + +#define MCACHE_SIZE_EXP 12 + +struct type_cache { + struct type_cache_entry hashtable[1 << MCACHE_SIZE_EXP]; +}; + +typedef struct { + PyTypeObject *type; + int isbuiltin; + int readying; + int ready; + // XXX tp_dict can probably be statically allocated, + // instead of dynamically and stored on the interpreter. + PyObject *tp_dict; + PyObject *tp_subclasses; + /* We never clean up weakrefs for static builtin types since + they will effectively never get triggered. However, there + are also some diagnostic uses for the list of weakrefs, + so we still keep it. */ + PyObject *tp_weaklist; +} managed_static_type_state; + +struct types_state { + /* Used to set PyTypeObject.tp_version_tag. + It starts at _Py_MAX_GLOBAL_TYPE_VERSION_TAG + 1, + where all those lower numbers are used for core static types. */ + unsigned int next_version_tag; + + struct type_cache type_cache; + + /* Every static builtin type is initialized for each interpreter + during its own initialization, including for the main interpreter + during global runtime initialization. This is done by calling + _PyStaticType_InitBuiltin(). + + The first time a static builtin type is initialized, all the + normal PyType_Ready() stuff happens. The only difference from + normal is that there are three PyTypeObject fields holding + objects which are stored here (on PyInterpreterState) rather + than in the corresponding PyTypeObject fields. Those are: + tp_dict (cls.__dict__), tp_subclasses (cls.__subclasses__), + and tp_weaklist. + + When a subinterpreter is initialized, each static builtin type + is still initialized, but only the interpreter-specific portion, + namely those three objects. + + Those objects are stored in the PyInterpreterState.types.builtins + array, at the index corresponding to each specific static builtin + type. That index (a size_t value) is stored in the tp_subclasses + field. For static builtin types, we re-purposed the now-unused + tp_subclasses to avoid adding another field to PyTypeObject. + In all other cases tp_subclasses holds a dict like before. + (The field was previously defined as PyObject*, but is now void* + to reflect its dual use.) + + The index for each static builtin type isn't statically assigned. + Instead it is calculated the first time a type is initialized + (by the main interpreter). The index matches the order in which + the type was initialized relative to the others. The actual + value comes from the current value of num_builtins_initialized, + as each type is initialized for the main interpreter. + + num_builtins_initialized is incremented once for each static + builtin type. Once initialization is over for a subinterpreter, + the value will be the same as for all other interpreters. */ + struct { + size_t num_initialized; + managed_static_type_state initialized[_Py_MAX_MANAGED_STATIC_BUILTIN_TYPES]; + } builtins; + /* We apply a similar strategy for managed extension modules. 
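+
+   (Editor's illustration of the index trick described above, not the actual
+   helper functions: for a static builtin type the per-interpreter state is
+   recovered roughly as
+
+       size_t index = (size_t)type->tp_subclasses;
+       managed_static_type_state *state =
+           &interp->types.builtins.initialized[index];
+
+   where `interp` is the current PyInterpreterState.)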
*/ + struct { + size_t num_initialized; + size_t next_index; + managed_static_type_state initialized[_Py_MAX_MANAGED_STATIC_EXT_TYPES]; + } for_extensions; + PyMutex mutex; +}; + + +/* runtime lifecycle */ + +extern PyStatus _PyTypes_InitTypes(PyInterpreterState *); +extern void _PyTypes_FiniTypes(PyInterpreterState *); +extern void _PyTypes_FiniExtTypes(PyInterpreterState *interp); +extern void _PyTypes_Fini(PyInterpreterState *); +extern void _PyTypes_AfterFork(void); + +/* other API */ + +/* Length of array of slotdef pointers used to store slots with the + same __name__. There should be at most MAX_EQUIV-1 slotdef entries with + the same __name__, for any __name__. Since that's a static property, it is + appropriate to declare fixed-size arrays for this. */ +#define MAX_EQUIV 10 + +typedef struct wrapperbase pytype_slotdef; + + +static inline PyObject ** +_PyStaticType_GET_WEAKREFS_LISTPTR(managed_static_type_state *state) +{ + assert(state != NULL); + return &state->tp_weaklist; +} + +extern int _PyStaticType_InitBuiltin( + PyInterpreterState *interp, + PyTypeObject *type); +extern void _PyStaticType_FiniBuiltin( + PyInterpreterState *interp, + PyTypeObject *type); +extern void _PyStaticType_ClearWeakRefs( + PyInterpreterState *interp, + PyTypeObject *type); +extern managed_static_type_state * _PyStaticType_GetState( + PyInterpreterState *interp, + PyTypeObject *type); + +// Export for '_datetime' shared extension. +PyAPI_FUNC(int) _PyStaticType_InitForExtension( + PyInterpreterState *interp, + PyTypeObject *self); + + +/* Like PyType_GetModuleState, but skips verification + * that type is a heap type with an associated module */ +static inline void * +_PyType_GetModuleState(PyTypeObject *type) +{ + assert(PyType_Check(type)); + assert(type->tp_flags & Py_TPFLAGS_HEAPTYPE); + PyHeapTypeObject *et = (PyHeapTypeObject *)type; + assert(et->ht_module); + PyModuleObject *mod = (PyModuleObject *)(et->ht_module); + assert(mod != NULL); + return mod->md_state; +} + + +// Export for 'math' shared extension, used via _PyType_IsReady() static inline +// function +PyAPI_FUNC(PyObject *) _PyType_GetDict(PyTypeObject *); + +extern PyObject * _PyType_GetBases(PyTypeObject *type); +extern PyObject * _PyType_GetMRO(PyTypeObject *type); +extern PyObject* _PyType_GetSubclasses(PyTypeObject *); +extern int _PyType_HasSubclasses(PyTypeObject *); +PyAPI_FUNC(PyObject *) _PyType_GetModuleByDef2(PyTypeObject *, PyTypeObject *, PyModuleDef *); +PyAPI_FUNC(PyObject *) _PyType_GetModuleByDef3(PyTypeObject *, PyTypeObject *, PyTypeObject *, PyModuleDef *); + +// PyType_Ready() must be called if _PyType_IsReady() is false. +// See also the Py_TPFLAGS_READY flag. +static inline int +_PyType_IsReady(PyTypeObject *type) +{ + return _PyType_GetDict(type) != NULL; +} + +extern PyObject* _Py_type_getattro_impl(PyTypeObject *type, PyObject *name, + int *suppress_missing_attribute); +extern PyObject* _Py_type_getattro(PyObject *type, PyObject *name); + +extern PyObject* _Py_BaseObject_RichCompare(PyObject* self, PyObject* other, int op); + +extern PyObject* _Py_slot_tp_getattro(PyObject *self, PyObject *name); +extern PyObject* _Py_slot_tp_getattr_hook(PyObject *self, PyObject *name); + +extern PyTypeObject _PyBufferWrapper_Type; + +PyAPI_FUNC(PyObject*) _PySuper_Lookup(PyTypeObject *su_type, PyObject *su_obj, + PyObject *name, int *meth_found); + +extern PyObject* _PyType_GetFullyQualifiedName(PyTypeObject *type, char sep); + +// Perform the following operation, in a thread-safe way when required by the +// build mode. 
+// +// self->tp_flags = (self->tp_flags & ~mask) | flags; +extern void _PyType_SetFlags(PyTypeObject *self, unsigned long mask, + unsigned long flags); +extern int _PyType_AddMethod(PyTypeObject *, PyMethodDef *); + +// Like _PyType_SetFlags(), but apply the operation to self and any of its +// subclasses without Py_TPFLAGS_IMMUTABLETYPE set. +extern void _PyType_SetFlagsRecursive(PyTypeObject *self, unsigned long mask, + unsigned long flags); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_TYPEOBJECT_H */ diff --git a/Include/internal/pycore_typevarobject.h b/Include/internal/pycore_typevarobject.h new file mode 100644 index 0000000000000000000000000000000000000000..a368edebd622a163b3bc9de07bff1e219fd85d29 --- /dev/null +++ b/Include/internal/pycore_typevarobject.h @@ -0,0 +1,27 @@ +#ifndef Py_INTERNAL_TYPEVAROBJECT_H +#define Py_INTERNAL_TYPEVAROBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +extern PyObject *_Py_make_typevar(PyObject *, PyObject *, PyObject *); +extern PyObject *_Py_make_paramspec(PyThreadState *, PyObject *); +extern PyObject *_Py_make_typevartuple(PyThreadState *, PyObject *); +extern PyObject *_Py_make_typealias(PyThreadState *, PyObject *); +extern PyObject *_Py_subscript_generic(PyThreadState *, PyObject *); +extern PyObject *_Py_set_typeparam_default(PyThreadState *, PyObject *, PyObject *); +extern int _Py_initialize_generic(PyInterpreterState *); +extern void _Py_clear_generic_types(PyInterpreterState *); + +extern PyTypeObject _PyTypeAlias_Type; +extern PyTypeObject _PyNoDefault_Type; +extern PyObject _Py_NoDefaultStruct; + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_TYPEVAROBJECT_H */ diff --git a/Include/internal/pycore_ucnhash.h b/Include/internal/pycore_ucnhash.h new file mode 100644 index 0000000000000000000000000000000000000000..1561dfbb3150d3fe3d61898c51701ef2831e8e2f --- /dev/null +++ b/Include/internal/pycore_ucnhash.h @@ -0,0 +1,36 @@ +/* Unicode name database interface */ +#ifndef Py_INTERNAL_UCNHASH_H +#define Py_INTERNAL_UCNHASH_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +/* revised ucnhash CAPI interface (exported through a "wrapper") */ + +#define PyUnicodeData_CAPSULE_NAME "unicodedata._ucnhash_CAPI" + +typedef struct { + + /* Get name for a given character code. + Returns non-zero if success, zero if not. + Does not set Python exceptions. */ + int (*getname)(Py_UCS4 code, char* buffer, int buflen, + int with_alias_and_seq); + + /* Get character code for a given name. + Same error handling as for getname(). 
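+
+       An editor's sketch (illustrative, not part of the header), assuming
+       `capi` was obtained from the "unicodedata._ucnhash_CAPI" capsule:
+
+           Py_UCS4 code;
+           if (capi->getcode("LATIN SMALL LETTER A", 20, &code, 0)) {
+               assert(code == 0x61);
+           }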
*/ + int (*getcode)(const char* name, int namelen, Py_UCS4* code, + int with_named_seq); + +} _PyUnicode_Name_CAPI; + +extern _PyUnicode_Name_CAPI* _PyUnicode_GetNameCAPI(void); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_UCNHASH_H */ diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h new file mode 100644 index 0000000000000000000000000000000000000000..5ebc7c120fc29dea743e0caea32d0ece54fc9898 --- /dev/null +++ b/Include/internal/pycore_unicodeobject.h @@ -0,0 +1,351 @@ +#ifndef Py_INTERNAL_UNICODEOBJECT_H +#define Py_INTERNAL_UNICODEOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_lock.h" // PyMutex +#include "pycore_fileutils.h" // _Py_error_handler +#include "pycore_identifier.h" // _Py_Identifier +#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI +#include "pycore_global_objects.h" // _Py_SINGLETON + +/* --- Characters Type APIs ----------------------------------------------- */ + +extern int _PyUnicode_IsXidStart(Py_UCS4 ch); +extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); +extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); +extern int _PyUnicode_IsCased(Py_UCS4 ch); + +/* --- Unicode API -------------------------------------------------------- */ + +// Export for '_json' shared extension +PyAPI_FUNC(int) _PyUnicode_CheckConsistency( + PyObject *op, + int check_content); + +PyAPI_FUNC(void) _PyUnicode_ExactDealloc(PyObject *op); +extern Py_ssize_t _PyUnicode_InternedSize(void); +extern Py_ssize_t _PyUnicode_InternedSize_Immortal(void); + +// Get a copy of a Unicode string. +// Export for '_datetime' shared extension. +PyAPI_FUNC(PyObject*) _PyUnicode_Copy( + PyObject *unicode); + +/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash + if parameters are invalid (e.g. if length is longer than the string). */ +extern void _PyUnicode_FastFill( + PyObject *unicode, + Py_ssize_t start, + Py_ssize_t length, + Py_UCS4 fill_char + ); + +/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so + may crash if parameters are invalid (e.g. if the output string + is too short). */ +extern void _PyUnicode_FastCopyCharacters( + PyObject *to, + Py_ssize_t to_start, + PyObject *from, + Py_ssize_t from_start, + Py_ssize_t how_many + ); + +/* Create a new string from a buffer of ASCII characters. + WARNING: Don't check if the string contains any non-ASCII character. */ +extern PyObject* _PyUnicode_FromASCII( + const char *buffer, + Py_ssize_t size); + +/* Compute the maximum character of the substring unicode[start:end]. + Return 127 for an empty string. */ +extern Py_UCS4 _PyUnicode_FindMaxChar ( + PyObject *unicode, + Py_ssize_t start, + Py_ssize_t end); + +/* --- _PyUnicodeWriter API ----------------------------------------------- */ + +/* Format the object based on the format_spec, as defined in PEP 3101 + (Advanced String Formatting). 
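+
+   A minimal call sketch (editor's illustration; _PyUnicodeWriter_Init(),
+   _PyUnicodeWriter_Dealloc() and _PyUnicodeWriter_Finish() are CPython
+   internals, and `obj` and `format_spec` are assumed to be valid objects):
+
+       _PyUnicodeWriter writer;
+       _PyUnicodeWriter_Init(&writer);
+       if (_PyUnicode_FormatAdvancedWriter(&writer, obj, format_spec, 0,
+                                           PyUnicode_GET_LENGTH(format_spec)) < 0) {
+           _PyUnicodeWriter_Dealloc(&writer);
+           return NULL;
+       }
+       return _PyUnicodeWriter_Finish(&writer);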
*/
+extern int _PyUnicode_FormatAdvancedWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    PyObject *format_spec,
+    Py_ssize_t start,
+    Py_ssize_t end);
+
+/* --- UTF-7 Codecs ------------------------------------------------------- */
+
+extern PyObject* _PyUnicode_EncodeUTF7(
+    PyObject *unicode,          /* Unicode object */
+    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
+    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
+    const char *errors);        /* error handling */
+
+/* --- UTF-8 Codecs ------------------------------------------------------- */
+
+// Export for '_tkinter' shared extension.
+PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
+    PyObject *unicode,
+    const char *errors);
+
+/* --- UTF-32 Codecs ------------------------------------------------------ */
+
+// Export for '_tkinter' shared extension
+PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
+    PyObject *object,           /* Unicode object */
+    const char *errors,         /* error handling */
+    int byteorder);             /* byteorder to use 0=BOM+native;-1=LE,1=BE */
+
+/* --- UTF-16 Codecs ------------------------------------------------------ */
+
+// Returns a Python bytes object holding the UTF-16 encoded value of
+// the Unicode data.
+//
+// If byteorder is not 0, output is written according to the following
+// byte order:
+//
+// byteorder == -1: little endian
+// byteorder == 0:  native byte order (writes a BOM mark)
+// byteorder == 1:  big endian
+//
+// If byteorder is 0, the output will always start with the
+// Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
+// prepended.
+//
+// Export for '_tkinter' shared extension
+PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
+    PyObject* unicode,          /* Unicode object */
+    const char *errors,         /* error handling */
+    int byteorder);             /* byteorder to use 0=BOM+native;-1=LE,1=BE */
+
+/* --- Unicode-Escape Codecs ---------------------------------------------- */
+
+/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
+extern PyObject* _PyUnicode_DecodeUnicodeEscapeStateful(
+    const char *string,         /* Unicode-Escape encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed);      /* bytes consumed */
+
+// Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
+// chars.
+// Export for test_peg_generator.
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
+    const char *string,         /* Unicode-Escape encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed,       /* bytes consumed */
+    int *first_invalid_escape_char, /* on return, if not -1, contains the first
+                                       invalid escaped char (<= 0xff) or invalid
+                                       octal escape (> 0xff) in string. */
+    const char **first_invalid_escape_ptr); /* on return, if not NULL, may
+                                               point to the first invalid escaped
+                                               char in string.
+                                               May be NULL if errors is not NULL. */
+// Export for binary compatibility.
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
+    const char *string,         /* Unicode-Escape encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed,       /* bytes consumed */
+    const char **first_invalid_escape); /* on return, points to first
+                                           invalid escaped char in
+                                           string. */
+
+/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
+
+/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding.
*/
+extern PyObject* _PyUnicode_DecodeRawUnicodeEscapeStateful(
+    const char *string,         /* Unicode-Escape encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed);      /* bytes consumed */
+
+/* --- Latin-1 Codecs ----------------------------------------------------- */
+
+extern PyObject* _PyUnicode_AsLatin1String(
+    PyObject* unicode,
+    const char* errors);
+
+/* --- ASCII Codecs ------------------------------------------------------- */
+
+extern PyObject* _PyUnicode_AsASCIIString(
+    PyObject* unicode,
+    const char* errors);
+
+/* --- Character Map Codecs ----------------------------------------------- */
+
+/* Translate a Unicode object by applying a character mapping table to
+   it and return the resulting Unicode object.
+
+   The mapping table must map Unicode ordinal integers to Unicode strings,
+   Unicode ordinal integers or None (causing deletion of the character).
+
+   Mapping tables may be dictionaries or sequences. Unmapped character
+   ordinals (ones which cause a LookupError) are left untouched and
+   are copied as-is.
+*/
+extern PyObject* _PyUnicode_EncodeCharmap(
+    PyObject *unicode,          /* Unicode object */
+    PyObject *mapping,          /* encoding mapping */
+    const char *errors);        /* error handling */
+
+/* --- Decimal Encoder ---------------------------------------------------- */
+
+// Converts a Unicode object holding a decimal value to an ASCII string
+// for use in the int, float and complex parsers.
+// Transforms code points that have the decimal digit property to the
+// corresponding ASCII digit code points. Transforms spaces to ASCII.
+// Transforms code points starting from the first non-ASCII code point that
+// is neither a decimal digit nor a space to the end into '?'.
+//
+// Export for '_testinternalcapi' shared extension.
+PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
+    PyObject *unicode);         /* Unicode object */
+
+/* --- Methods & Slots ---------------------------------------------------- */
+
+PyAPI_FUNC(PyObject*) _PyUnicode_JoinArray(
+    PyObject *separator,
+    PyObject *const *items,
+    Py_ssize_t seqlen
+    );
+
+/* Test whether a unicode object is equal to an ASCII identifier. Return 1 if
+   true, 0 otherwise. The right argument must be an ASCII identifier.
+   Any error that occurs inside will be cleared before returning. */
+extern int _PyUnicode_EqualToASCIIId(
+    PyObject *left,             /* Left string */
+    _Py_Identifier *right       /* Right identifier */
+    );
+
+// Test whether a unicode object is equal to an ASCII string. Return 1 if true,
+// 0 otherwise. The right argument must be an ASCII-encoded string.
+// Any error that occurs inside will be cleared before returning.
+// Export for '_ctypes' shared extension
+PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
+    PyObject *left,
+    const char *right           /* ASCII-encoded string */
+    );
+
+/* Externally visible for str.strip(unicode) */
+extern PyObject* _PyUnicode_XStrip(
+    PyObject *self,
+    int striptype,
+    PyObject *sepobj
+    );
+
+
+/* Using explicit passed-in values, insert the thousands grouping
+   into the string pointed to by buffer.
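+   For example (editor's illustration), grouping "\3" with a thousands_sep
+   of "," turns the digit string "1234567" into "1,234,567".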
For the argument descriptions, + see Objects/stringlib/localeutil.h */ +extern Py_ssize_t _PyUnicode_InsertThousandsGrouping( + _PyUnicodeWriter *writer, + Py_ssize_t n_buffer, + PyObject *digits, + Py_ssize_t d_pos, + Py_ssize_t n_digits, + Py_ssize_t min_width, + const char *grouping, + PyObject *thousands_sep, + Py_UCS4 *maxchar); + +/* --- Misc functions ----------------------------------------------------- */ + +extern PyObject* _PyUnicode_FormatLong(PyObject *, int, int, int); + +/* Fast equality check when the inputs are known to be exact unicode types + and where the hash values are equal (i.e. a very probable match) */ +extern int _PyUnicode_EQ(PyObject *, PyObject *); + +// Equality check. +// Export for '_pickle' shared extension. +PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *); + +extern int _PyUnicode_WideCharString_Converter(PyObject *, void *); +extern int _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *); + +// Export for test_peg_generator +PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *); + +/* --- Runtime lifecycle -------------------------------------------------- */ + +extern void _PyUnicode_InitState(PyInterpreterState *); +extern PyStatus _PyUnicode_InitGlobalObjects(PyInterpreterState *); +extern PyStatus _PyUnicode_InitTypes(PyInterpreterState *); +extern void _PyUnicode_Fini(PyInterpreterState *); +extern void _PyUnicode_FiniTypes(PyInterpreterState *); + +extern PyTypeObject _PyUnicodeASCIIIter_Type; + +/* --- Interning ---------------------------------------------------------- */ + +// All these are "ref-neutral", like the public PyUnicode_InternInPlace. + +// Explicit interning routines: +PyAPI_FUNC(void) _PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **); +PyAPI_FUNC(void) _PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **); +// Left here to help backporting: +PyAPI_FUNC(void) _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p); +// Only for singletons in the _PyRuntime struct: +extern void _PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **); + +/* --- Other API ---------------------------------------------------------- */ + +struct _Py_unicode_runtime_ids { + PyMutex mutex; + // next_index value must be preserved when Py_Initialize()/Py_Finalize() + // is called multiple times: see _PyUnicode_FromId() implementation. + Py_ssize_t next_index; +}; + +struct _Py_unicode_runtime_state { + struct _Py_unicode_runtime_ids ids; +}; + +/* fs_codec.encoding is initialized to NULL. + Later, it is set to a non-NULL string by _PyUnicode_InitEncodings(). */ +struct _Py_unicode_fs_codec { + char *encoding; // Filesystem encoding (encoded to UTF-8) + int utf8; // encoding=="utf-8"? + char *errors; // Filesystem errors (encoded to UTF-8) + _Py_error_handler error_handler; +}; + +struct _Py_unicode_ids { + Py_ssize_t size; + PyObject **array; +}; + +struct _Py_unicode_state { + struct _Py_unicode_fs_codec fs_codec; + + _PyUnicode_Name_CAPI *ucnhash_capi; + + // Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId() + struct _Py_unicode_ids ids; +}; + +extern void _PyUnicode_ClearInterned(PyInterpreterState *interp); + +// Like PyUnicode_AsUTF8(), but check for embedded null characters. +// Export for '_sqlite3' shared extension. 
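+// Editor's sketch (illustrative): for a string with an embedded NUL such as
+// "a\0b", this returns NULL with an exception set, where PyUnicode_AsUTF8()
+// would return a C string that silently appears truncated:
+//
+//     const char *s = _PyUnicode_AsUTF8NoNUL(obj);
+//     if (s == NULL) {
+//         return NULL;   // error already set
+//     }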
+PyAPI_FUNC(const char *) _PyUnicode_AsUTF8NoNUL(PyObject *); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_UNICODEOBJECT_H */ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h new file mode 100644 index 0000000000000000000000000000000000000000..7f6b6e07984a9a9741ce9329cac07d7e301d0405 --- /dev/null +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -0,0 +1,2980 @@ +#ifndef Py_INTERNAL_UNICODEOBJECT_GENERATED_H +#define Py_INTERNAL_UNICODEOBJECT_GENERATED_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +/* The following is auto-generated by Tools/build/generate_global_objects.py. */ +static inline void +_PyUnicode_InitStaticStrings(PyInterpreterState *interp) { + PyObject *string; + string = &_Py_ID(CANCELLED); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(FINISHED); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(False); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(JSONDecodeError); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(PENDING); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(Py_Repr); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(TextIOWrapper); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(True); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(WarningMessage); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(_WindowsConsoleIO); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__IOBase_closed); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__abc_tpflags__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__abs__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__abstractmethods__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__add__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__aenter__); + _PyUnicode_InternStatic(interp, &string); + 
assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__aexit__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__aiter__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__all__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__and__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__anext__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__annotations__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__args__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__await__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__bases__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__bool__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__buffer__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__build_class__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__builtins__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__bytes__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__call__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__cantrace__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__class__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__class_getitem__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__classcell__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(__classdict__); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = 
&_Py_ID(__classdictcell__);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
+    // Intern the remaining statically allocated identifiers, applying the
+    // same intern-and-check pattern to each table entry in turn.
+    static PyObject *const static_ids[] = {
+        &_Py_ID(__complex__), &_Py_ID(__contains__), &_Py_ID(__copy__),
+        &_Py_ID(__ctypes_from_outparam__), &_Py_ID(__del__), &_Py_ID(__delattr__),
+        &_Py_ID(__delete__), &_Py_ID(__delitem__), &_Py_ID(__dict__),
+        &_Py_ID(__dictoffset__), &_Py_ID(__dir__), &_Py_ID(__divmod__),
+        &_Py_ID(__doc__), &_Py_ID(__enter__), &_Py_ID(__eq__), &_Py_ID(__exit__),
+        &_Py_ID(__file__), &_Py_ID(__firstlineno__), &_Py_ID(__float__),
+        &_Py_ID(__floordiv__), &_Py_ID(__format__), &_Py_ID(__fspath__),
+        &_Py_ID(__ge__), &_Py_ID(__get__), &_Py_ID(__getattr__),
+        &_Py_ID(__getattribute__), &_Py_ID(__getinitargs__), &_Py_ID(__getitem__),
+        &_Py_ID(__getnewargs__), &_Py_ID(__getnewargs_ex__), &_Py_ID(__getstate__),
+        &_Py_ID(__gt__), &_Py_ID(__hash__), &_Py_ID(__iadd__), &_Py_ID(__iand__),
+        &_Py_ID(__ifloordiv__), &_Py_ID(__ilshift__), &_Py_ID(__imatmul__),
+        &_Py_ID(__imod__), &_Py_ID(__import__), &_Py_ID(__imul__),
+        &_Py_ID(__index__), &_Py_ID(__init__), &_Py_ID(__init_subclass__),
+        &_Py_ID(__instancecheck__), &_Py_ID(__int__), &_Py_ID(__invert__),
+        &_Py_ID(__ior__), &_Py_ID(__ipow__), &_Py_ID(__irshift__),
+        &_Py_ID(__isabstractmethod__), &_Py_ID(__isub__), &_Py_ID(__iter__),
+        &_Py_ID(__itruediv__), &_Py_ID(__ixor__), &_Py_ID(__le__), &_Py_ID(__len__),
+        &_Py_ID(__length_hint__), &_Py_ID(__lltrace__), &_Py_ID(__loader__),
+        &_Py_ID(__lshift__), &_Py_ID(__lt__), &_Py_ID(__main__),
+        &_Py_ID(__match_args__), &_Py_ID(__matmul__), &_Py_ID(__missing__),
+        &_Py_ID(__mod__), &_Py_ID(__module__), &_Py_ID(__mro_entries__),
+        &_Py_ID(__mul__), &_Py_ID(__name__), &_Py_ID(__ne__), &_Py_ID(__neg__),
+        &_Py_ID(__new__), &_Py_ID(__newobj__), &_Py_ID(__newobj_ex__),
+        &_Py_ID(__next__), &_Py_ID(__notes__), &_Py_ID(__or__),
+        &_Py_ID(__orig_class__), &_Py_ID(__origin__), &_Py_ID(__package__),
+        &_Py_ID(__parameters__), &_Py_ID(__path__), &_Py_ID(__pos__),
+        &_Py_ID(__pow__), &_Py_ID(__prepare__), &_Py_ID(__qualname__),
+        &_Py_ID(__radd__), &_Py_ID(__rand__), &_Py_ID(__rdivmod__),
+        &_Py_ID(__reduce__), &_Py_ID(__reduce_ex__), &_Py_ID(__release_buffer__),
+        &_Py_ID(__repr__), &_Py_ID(__reversed__), &_Py_ID(__rfloordiv__),
+        &_Py_ID(__rlshift__), &_Py_ID(__rmatmul__), &_Py_ID(__rmod__),
+        &_Py_ID(__rmul__), &_Py_ID(__ror__), &_Py_ID(__round__), &_Py_ID(__rpow__),
+        &_Py_ID(__rrshift__), &_Py_ID(__rshift__), &_Py_ID(__rsub__),
+        &_Py_ID(__rtruediv__), &_Py_ID(__rxor__), &_Py_ID(__set__),
+        &_Py_ID(__set_name__), &_Py_ID(__setattr__), &_Py_ID(__setitem__),
+        &_Py_ID(__setstate__), &_Py_ID(__sizeof__), &_Py_ID(__slotnames__),
+        &_Py_ID(__slots__), &_Py_ID(__spec__), &_Py_ID(__static_attributes__),
+        &_Py_ID(__str__), &_Py_ID(__sub__), &_Py_ID(__subclasscheck__),
+        &_Py_ID(__subclasshook__), &_Py_ID(__truediv__), &_Py_ID(__trunc__),
+        &_Py_ID(__type_params__), &_Py_ID(__typing_is_unpacked_typevartuple__),
+        &_Py_ID(__typing_prepare_subst__), &_Py_ID(__typing_subst__),
+        &_Py_ID(__typing_unpacked_tuple_args__), &_Py_ID(__warningregistry__),
+        &_Py_ID(__weaklistoffset__), &_Py_ID(__weakref__), &_Py_ID(__xor__),
+        &_Py_ID(_abc_impl), &_Py_ID(_abstract_), &_Py_ID(_active),
+        &_Py_ID(_align_), &_Py_ID(_annotation), &_Py_ID(_anonymous_),
+        &_Py_ID(_argtypes_), &_Py_ID(_as_parameter_),
+        &_Py_ID(_asyncio_future_blocking), &_Py_ID(_blksize), &_Py_ID(_bootstrap),
+        &_Py_ID(_check_retval_), &_Py_ID(_dealloc_warn), &_Py_ID(_feature_version),
+        &_Py_ID(_field_types), &_Py_ID(_fields_), &_Py_ID(_finalizing),
+        &_Py_ID(_find_and_load), &_Py_ID(_fix_up_module), &_Py_ID(_flags_),
+        &_Py_ID(_get_sourcefile), &_Py_ID(_handle_fromlist), &_Py_ID(_initializing),
+        &_Py_ID(_io), &_Py_ID(_is_text_encoding), &_Py_ID(_length_),
+        &_Py_ID(_limbo), &_Py_ID(_lock_unlock_module), &_Py_ID(_loop),
+        &_Py_ID(_needs_com_addref_), &_Py_ID(_only_immortal), &_Py_ID(_pack_),
+        &_Py_ID(_restype_), &_Py_ID(_showwarnmsg), &_Py_ID(_shutdown),
+        &_Py_ID(_slotnames), &_Py_ID(_strptime), &_Py_ID(_strptime_datetime),
+        &_Py_ID(_swappedbytes_), &_Py_ID(_type_),
+        &_Py_ID(_uninitialized_submodules), &_Py_ID(_warn_unawaited_coroutine),
+        &_Py_ID(_xoptions), &_Py_ID(abs_tol), &_Py_ID(access), &_Py_ID(aclose),
+        &_Py_ID(add), &_Py_ID(add_done_callback), &_Py_ID(after_in_child),
+        &_Py_ID(after_in_parent), &_Py_ID(aggregate_class), &_Py_ID(alias),
+        &_Py_ID(allow_code), &_Py_ID(append), &_Py_ID(arg), &_Py_ID(argdefs),
+        &_Py_ID(args), &_Py_ID(arguments), &_Py_ID(argv),
+        &_Py_ID(as_integer_ratio), &_Py_ID(asend), &_Py_ID(ast), &_Py_ID(athrow),
+        &_Py_ID(attribute), &_Py_ID(authorizer_callback), &_Py_ID(autocommit),
+        &_Py_ID(backtick), &_Py_ID(base), &_Py_ID(before), &_Py_ID(big),
+        &_Py_ID(binary_form), &_Py_ID(block), &_Py_ID(bound), &_Py_ID(buffer),
+        &_Py_ID(buffer_callback), &_Py_ID(buffer_size), &_Py_ID(buffering),
+        &_Py_ID(buffers), &_Py_ID(bufsize), &_Py_ID(builtins), &_Py_ID(byteorder),
+        &_Py_ID(bytes), &_Py_ID(bytes_per_sep), &_Py_ID(c_call),
+        &_Py_ID(c_exception), &_Py_ID(c_return), &_Py_ID(cached_datetime_module),
+        &_Py_ID(cached_statements), &_Py_ID(cadata), &_Py_ID(cafile),
+        &_Py_ID(call), &_Py_ID(call_exception_handler), &_Py_ID(call_soon),
+        &_Py_ID(callback), &_Py_ID(cancel), &_Py_ID(capath), &_Py_ID(category),
+        &_Py_ID(cb_type), &_Py_ID(certfile), &_Py_ID(check_same_thread),
+        &_Py_ID(clear), &_Py_ID(close), &_Py_ID(closed), &_Py_ID(closefd),
+        &_Py_ID(closure), &_Py_ID(co_argcount), &_Py_ID(co_cellvars),
+        &_Py_ID(co_code), &_Py_ID(co_consts), &_Py_ID(co_exceptiontable),
+        &_Py_ID(co_filename), &_Py_ID(co_firstlineno), &_Py_ID(co_flags),
+        &_Py_ID(co_freevars), &_Py_ID(co_kwonlyargcount), &_Py_ID(co_linetable),
+        &_Py_ID(co_name), &_Py_ID(co_names), &_Py_ID(co_nlocals),
+        &_Py_ID(co_posonlyargcount), &_Py_ID(co_qualname), &_Py_ID(co_stacksize),
+        &_Py_ID(co_varnames), &_Py_ID(code), &_Py_ID(col_offset),
+        &_Py_ID(command), &_Py_ID(comment_factory), &_Py_ID(compile_mode),
+        &_Py_ID(consts), &_Py_ID(context), &_Py_ID(contravariant),
+        &_Py_ID(cookie), &_Py_ID(copy), &_Py_ID(copyreg), &_Py_ID(coro),
+        &_Py_ID(count), &_Py_ID(covariant), &_Py_ID(cwd), &_Py_ID(data),
+        &_Py_ID(database), &_Py_ID(day), &_Py_ID(decode), &_Py_ID(decoder),
+        &_Py_ID(default), &_Py_ID(defaultaction), &_Py_ID(delete), &_Py_ID(depth),
+        &_Py_ID(desired_access), &_Py_ID(detect_types), &_Py_ID(deterministic),
+        &_Py_ID(device), &_Py_ID(dict), &_Py_ID(dictcomp),
+        &_Py_ID(difference_update), &_Py_ID(digest), &_Py_ID(digest_size),
+        &_Py_ID(digestmod), &_Py_ID(dir_fd), &_Py_ID(discard),
+        &_Py_ID(dispatch_table), &_Py_ID(displayhook), &_Py_ID(dklen),
+        &_Py_ID(doc), &_Py_ID(dont_inherit), &_Py_ID(dst), &_Py_ID(dst_dir_fd),
+        &_Py_ID(eager_start), &_Py_ID(effective_ids), &_Py_ID(element_factory),
+        &_Py_ID(encode), &_Py_ID(encoding), &_Py_ID(end), &_Py_ID(end_col_offset),
+        &_Py_ID(end_lineno), &_Py_ID(end_offset), &_Py_ID(endpos),
+        &_Py_ID(entrypoint), &_Py_ID(env), &_Py_ID(errors), &_Py_ID(event),
+        &_Py_ID(eventmask), &_Py_ID(exc_type), &_Py_ID(exc_value),
+        &_Py_ID(excepthook), &_Py_ID(exception), &_Py_ID(existing_file_name),
+        &_Py_ID(exp), &_Py_ID(extend), &_Py_ID(extra_tokens), &_Py_ID(facility),
+        &_Py_ID(factory), &_Py_ID(false), &_Py_ID(family), &_Py_ID(fanout),
+        &_Py_ID(fd), &_Py_ID(fd2), &_Py_ID(fdel), &_Py_ID(fget), &_Py_ID(file),
+        &_Py_ID(file_actions), &_Py_ID(filename), &_Py_ID(fileno),
+        &_Py_ID(filepath), &_Py_ID(fillvalue), &_Py_ID(filter), &_Py_ID(filters),
+        &_Py_ID(final), &_Py_ID(find_class), &_Py_ID(fix_imports),
+        &_Py_ID(flags), &_Py_ID(flush), &_Py_ID(fold), &_Py_ID(follow_symlinks),
+        &_Py_ID(format), &_Py_ID(from_param), &_Py_ID(fromlist),
+        &_Py_ID(fromtimestamp), &_Py_ID(fromutc), &_Py_ID(fset), &_Py_ID(func),
+        &_Py_ID(future), &_Py_ID(generation), &_Py_ID(genexpr), &_Py_ID(get),
+        &_Py_ID(get_debug), &_Py_ID(get_event_loop), &_Py_ID(get_loop),
+        &_Py_ID(get_source), &_Py_ID(getattr), &_Py_ID(getstate), &_Py_ID(gid),
+        &_Py_ID(globals), &_Py_ID(groupindex), &_Py_ID(groups), &_Py_ID(handle),
+        &_Py_ID(handle_seq), &_Py_ID(has_location), &_Py_ID(hash_name),
+        &_Py_ID(header), &_Py_ID(headers), &_Py_ID(hi), &_Py_ID(hook),
+        &_Py_ID(hour), &_Py_ID(ident), &_Py_ID(identity_hint), &_Py_ID(ignore),
+        &_Py_ID(imag), &_Py_ID(importlib), &_Py_ID(in_fd), &_Py_ID(incoming),
+        &_Py_ID(indexgroup), &_Py_ID(inf), &_Py_ID(infer_variance),
+        &_Py_ID(inherit_handle), &_Py_ID(inheritable), &_Py_ID(initial),
+        &_Py_ID(initial_bytes), &_Py_ID(initial_owner), &_Py_ID(initial_state),
+        &_Py_ID(initial_value), &_Py_ID(initval), &_Py_ID(inner_size),
+        &_Py_ID(input), &_Py_ID(insert_comments), &_Py_ID(insert_pis),
+        &_Py_ID(instructions), &_Py_ID(intern), &_Py_ID(intersection),
+        &_Py_ID(interval), &_Py_ID(is_running), &_Py_ID(isatty),
+        &_Py_ID(isinstance), &_Py_ID(isoformat), &_Py_ID(isolation_level),
+        &_Py_ID(istext), &_Py_ID(item), &_Py_ID(items), &_Py_ID(iter),
+        &_Py_ID(iterable), &_Py_ID(iterations), &_Py_ID(join), &_Py_ID(jump),
+        &_Py_ID(keepends), &_Py_ID(key), &_Py_ID(keyfile), &_Py_ID(keys),
+        &_Py_ID(kind), &_Py_ID(kw), &_Py_ID(kw1), &_Py_ID(kw2),
+        &_Py_ID(kwdefaults), &_Py_ID(label), &_Py_ID(lambda), &_Py_ID(last),
+        &_Py_ID(last_exc), &_Py_ID(last_node), &_Py_ID(last_traceback),
+        &_Py_ID(last_type), &_Py_ID(last_value), &_Py_ID(latin1),
+        &_Py_ID(leaf_size),
+    };
+    for (size_t i = 0; i < Py_ARRAY_LENGTH(static_ids); i++) {
+        string = static_ids[i];
+        _PyUnicode_InternStatic(interp, &string);
+        assert(_PyUnicode_CheckConsistency(string, 1));
+        assert(PyUnicode_GET_LENGTH(string) != 1);
+    }
+    string = &_Py_ID(len);
+    _PyUnicode_InternStatic(interp, &string);
+ assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(length); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(level); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(limit); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(line); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(line_buffering); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(lineno); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(listcomp); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(little); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(lo); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(locale); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(locals); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(logoption); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(loop); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(manual_reset); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(mapping); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(match); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(max_length); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(maxdigits); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(maxevents); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(maxlen); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(maxmem); + _PyUnicode_InternStatic(interp, &string); + 
assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(maxsplit); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(maxvalue); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(memLevel); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(memlimit); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(message); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(metaclass); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(metadata); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(method); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(microsecond); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(milliseconds); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(minute); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(mod); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(mode); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(module); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(module_globals); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(modules); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(month); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(mro); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(msg); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(mutex); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(mycmp); + _PyUnicode_InternStatic(interp, &string); + 
assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(n_arg); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(n_fields); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(n_sequence_fields); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(n_unnamed_fields); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(name); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(name_from); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(namespace_separator); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(namespaces); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(narg); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(ndigits); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(nested); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(new_file_name); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(new_limit); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(newline); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(newlines); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(next); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(nlocals); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(node_depth); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(node_offset); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(ns); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(nstype); + _PyUnicode_InternStatic(interp, 
&string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(nt); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(null); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(number); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(obj); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(object); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(offset); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(offset_dst); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(offset_src); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(on_type_read); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(onceregistry); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(only_keys); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(oparg); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(opcode); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(open); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(opener); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(operation); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(optimize); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(options); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(order); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(origin); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(out_fd); + _PyUnicode_InternStatic(interp, &string); + 
assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(outgoing); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(overlapped); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(owner); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(pages); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(parent); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(password); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(path); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(pattern); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(peek); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(persistent_id); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(persistent_load); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(person); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(pi_factory); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(pid); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(policy); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(pos); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(pos1); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(pos2); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(posix); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(print_file_and_line); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(priority); + _PyUnicode_InternStatic(interp, &string); + 
assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(progress); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(progress_handler); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(progress_routine); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(proto); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(protocol); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(ps1); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(ps2); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(query); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(quotetabs); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(raw); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(read); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(read1); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(readable); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(readall); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(readinto); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(readinto1); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(readline); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(readonly); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(real); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(reducer_override); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(registry); + _PyUnicode_InternStatic(interp, &string); + 
assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(rel_tol); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(release); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(reload); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(repl); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(replace); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(reserved); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(reset); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(resetids); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(return); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(reverse); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(reversed); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(salt); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(sched_priority); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(scheduler); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(second); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(security_attributes); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(seek); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(seekable); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(selectors); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(self); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(send); + _PyUnicode_InternStatic(interp, &string); + 
assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(sep); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(sequence); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(server_hostname); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(server_side); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(session); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(setcomp); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(setpgroup); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(setsid); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(setsigdef); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(setsigmask); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(setstate); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(shape); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(show_cmd); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(signed); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(size); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(sizehint); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(skip_file_prefixes); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(sleep); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(sock); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(sort); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(source); + _PyUnicode_InternStatic(interp, &string); + 
assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(source_traceback); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(spam); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(src); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(src_dir_fd); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(stacklevel); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(start); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(statement); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(status); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(stderr); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(stdin); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(stdout); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(step); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(steps); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(store_name); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(strategy); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(strftime); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(strict); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(strict_mode); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(string); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(sub_key); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(symmetric_difference_update); + _PyUnicode_InternStatic(interp, &string); + 
assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(tabsize); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(tag); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(target); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(target_is_directory); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(task); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(tb_frame); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(tb_lasti); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(tb_lineno); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(tb_next); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(tell); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(template); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(term); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(text); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(threading); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(throw); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(timeout); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(times); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(timetuple); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(top); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(trace_callback); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(traceback); + _PyUnicode_InternStatic(interp, &string); + 
assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(trailers); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(translate); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(true); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(truncate); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(twice); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(txt); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(type); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(type_params); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(tz); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(tzinfo); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(tzname); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(uid); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(unlink); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(unraisablehook); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(uri); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(usedforsecurity); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(value); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(values); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(version); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(volume); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(wait_all); + _PyUnicode_InternStatic(interp, &string); + 
assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(warn_on_full_buffer); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(warnings); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(warnoptions); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(wbits); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(week); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(weekday); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(which); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(who); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(withdata); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(writable); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(write); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(write_through); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(year); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(zdict); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(empty); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(dbl_percent); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(dot_locals); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(defaults); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(generic_base); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(kwdefaults); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(type_params); + _PyUnicode_InternStatic(interp, 
&string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(str_replace_inf); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(anon_null); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(anon_dictcomp); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(anon_genexpr); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(anon_lambda); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(anon_listcomp); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(anon_module); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(anon_setcomp); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(anon_string); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(anon_unknown); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(json_decoder); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(list_err); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(utf_8); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(dbl_open_br); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_STR(dbl_close_br); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); +} +/* End auto-generated code */ +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_UNICODEOBJECT_GENERATED_H */ diff --git a/Include/internal/pycore_unionobject.h b/Include/internal/pycore_unionobject.h new file mode 100644 index 0000000000000000000000000000000000000000..6ece7134cdeca030ff4f177d5d62b7a6e0adcabb --- /dev/null +++ b/Include/internal/pycore_unionobject.h @@ -0,0 +1,25 @@ +#ifndef Py_INTERNAL_UNIONOBJECT_H +#define Py_INTERNAL_UNIONOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +// For extensions created by test_peg_generator +PyAPI_DATA(PyTypeObject) _PyUnion_Type; +PyAPI_FUNC(PyObject *) _Py_union_type_or(PyObject *, PyObject *); + +#define _PyUnion_Check(op) Py_IS_TYPE((op), &_PyUnion_Type) + +#define _PyGenericAlias_Check(op) 
PyObject_TypeCheck((op), &Py_GenericAliasType) +extern PyObject *_Py_subs_parameters(PyObject *, PyObject *, PyObject *, PyObject *); +extern PyObject *_Py_make_parameters(PyObject *); +extern PyObject *_Py_union_args(PyObject *self); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_UNIONOBJECT_H */ diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h new file mode 100644 index 0000000000000000000000000000000000000000..1e6ef8e54a221ac7a535fc83048e0806367e9836 --- /dev/null +++ b/Include/internal/pycore_uop_ids.h @@ -0,0 +1,294 @@ +// This file is generated by Tools/cases_generator/uop_id_generator.py +// from: +// Python/bytecodes.c +// Do not edit! + +#ifndef Py_CORE_UOP_IDS_H +#define Py_CORE_UOP_IDS_H +#ifdef __cplusplus +extern "C" { +#endif + +#define _EXIT_TRACE 300 +#define _SET_IP 301 +#define _BEFORE_ASYNC_WITH BEFORE_ASYNC_WITH +#define _BEFORE_WITH BEFORE_WITH +#define _BINARY_OP 302 +#define _BINARY_OP_ADD_FLOAT 303 +#define _BINARY_OP_ADD_INT 304 +#define _BINARY_OP_ADD_UNICODE 305 +#define _BINARY_OP_MULTIPLY_FLOAT 306 +#define _BINARY_OP_MULTIPLY_INT 307 +#define _BINARY_OP_SUBTRACT_FLOAT 308 +#define _BINARY_OP_SUBTRACT_INT 309 +#define _BINARY_SLICE BINARY_SLICE +#define _BINARY_SUBSCR 310 +#define _BINARY_SUBSCR_DICT BINARY_SUBSCR_DICT +#define _BINARY_SUBSCR_GETITEM BINARY_SUBSCR_GETITEM +#define _BINARY_SUBSCR_LIST_INT BINARY_SUBSCR_LIST_INT +#define _BINARY_SUBSCR_STR_INT BINARY_SUBSCR_STR_INT +#define _BINARY_SUBSCR_TUPLE_INT BINARY_SUBSCR_TUPLE_INT +#define _BUILD_CONST_KEY_MAP BUILD_CONST_KEY_MAP +#define _BUILD_LIST BUILD_LIST +#define _BUILD_MAP BUILD_MAP +#define _BUILD_SET BUILD_SET +#define _BUILD_SLICE BUILD_SLICE +#define _BUILD_STRING BUILD_STRING +#define _BUILD_TUPLE BUILD_TUPLE +#define _CALL 311 +#define _CALL_ALLOC_AND_ENTER_INIT CALL_ALLOC_AND_ENTER_INIT +#define _CALL_BUILTIN_CLASS 312 +#define _CALL_BUILTIN_FAST 313 +#define _CALL_BUILTIN_FAST_WITH_KEYWORDS 314 +#define _CALL_BUILTIN_O 315 +#define _CALL_FUNCTION_EX CALL_FUNCTION_EX +#define _CALL_INTRINSIC_1 CALL_INTRINSIC_1 +#define _CALL_INTRINSIC_2 CALL_INTRINSIC_2 +#define _CALL_ISINSTANCE CALL_ISINSTANCE +#define _CALL_KW CALL_KW +#define _CALL_LEN CALL_LEN +#define _CALL_METHOD_DESCRIPTOR_FAST 316 +#define _CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS 317 +#define _CALL_METHOD_DESCRIPTOR_NOARGS 318 +#define _CALL_METHOD_DESCRIPTOR_O 319 +#define _CALL_NON_PY_GENERAL 320 +#define _CALL_STR_1 321 +#define _CALL_TUPLE_1 322 +#define _CALL_TYPE_1 CALL_TYPE_1 +#define _CHECK_ATTR_CLASS 323 +#define _CHECK_ATTR_METHOD_LAZY_DICT 324 +#define _CHECK_ATTR_MODULE 325 +#define _CHECK_ATTR_WITH_HINT 326 +#define _CHECK_CALL_BOUND_METHOD_EXACT_ARGS 327 +#define _CHECK_EG_MATCH CHECK_EG_MATCH +#define _CHECK_EXC_MATCH CHECK_EXC_MATCH +#define _CHECK_FUNCTION 328 +#define _CHECK_FUNCTION_EXACT_ARGS 329 +#define _CHECK_FUNCTION_VERSION 330 +#define _CHECK_IS_NOT_PY_CALLABLE 331 +#define _CHECK_MANAGED_OBJECT_HAS_VALUES 332 +#define _CHECK_METHOD_VERSION 333 +#define _CHECK_PEP_523 334 +#define _CHECK_PERIODIC 335 +#define _CHECK_STACK_SPACE 336 +#define _CHECK_STACK_SPACE_OPERAND 337 +#define _CHECK_VALIDITY 338 +#define _CHECK_VALIDITY_AND_SET_IP 339 +#define _COLD_EXIT 340 +#define _COMPARE_OP 341 +#define _COMPARE_OP_FLOAT 342 +#define _COMPARE_OP_INT 343 +#define _COMPARE_OP_STR 344 +#define _CONTAINS_OP 345 +#define _CONTAINS_OP_DICT CONTAINS_OP_DICT +#define _CONTAINS_OP_SET CONTAINS_OP_SET +#define _CONVERT_VALUE CONVERT_VALUE +#define _COPY COPY +#define 
_COPY_FREE_VARS COPY_FREE_VARS +#define _DELETE_ATTR DELETE_ATTR +#define _DELETE_DEREF DELETE_DEREF +#define _DELETE_FAST DELETE_FAST +#define _DELETE_GLOBAL DELETE_GLOBAL +#define _DELETE_NAME DELETE_NAME +#define _DELETE_SUBSCR DELETE_SUBSCR +#define _DEOPT 346 +#define _DICT_MERGE DICT_MERGE +#define _DICT_UPDATE DICT_UPDATE +#define _DYNAMIC_EXIT 347 +#define _END_SEND END_SEND +#define _ERROR_POP_N 348 +#define _EXIT_INIT_CHECK EXIT_INIT_CHECK +#define _EXPAND_METHOD 349 +#define _FATAL_ERROR 350 +#define _FORMAT_SIMPLE FORMAT_SIMPLE +#define _FORMAT_WITH_SPEC FORMAT_WITH_SPEC +#define _FOR_ITER 351 +#define _FOR_ITER_GEN_FRAME 352 +#define _FOR_ITER_TIER_TWO 353 +#define _GET_AITER GET_AITER +#define _GET_ANEXT GET_ANEXT +#define _GET_AWAITABLE GET_AWAITABLE +#define _GET_ITER GET_ITER +#define _GET_LEN GET_LEN +#define _GET_YIELD_FROM_ITER GET_YIELD_FROM_ITER +#define _GUARD_BOTH_FLOAT 354 +#define _GUARD_BOTH_INT 355 +#define _GUARD_BOTH_UNICODE 356 +#define _GUARD_BUILTINS_VERSION 357 +#define _GUARD_DORV_NO_DICT 358 +#define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT 359 +#define _GUARD_GLOBALS_VERSION 360 +#define _GUARD_IS_FALSE_POP 361 +#define _GUARD_IS_NONE_POP 362 +#define _GUARD_IS_NOT_NONE_POP 363 +#define _GUARD_IS_TRUE_POP 364 +#define _GUARD_KEYS_VERSION 365 +#define _GUARD_NOS_FLOAT 366 +#define _GUARD_NOS_INT 367 +#define _GUARD_NOT_EXHAUSTED_LIST 368 +#define _GUARD_NOT_EXHAUSTED_RANGE 369 +#define _GUARD_NOT_EXHAUSTED_TUPLE 370 +#define _GUARD_TOS_FLOAT 371 +#define _GUARD_TOS_INT 372 +#define _GUARD_TYPE_VERSION 373 +#define _INIT_CALL_BOUND_METHOD_EXACT_ARGS 374 +#define _INIT_CALL_PY_EXACT_ARGS 375 +#define _INIT_CALL_PY_EXACT_ARGS_0 376 +#define _INIT_CALL_PY_EXACT_ARGS_1 377 +#define _INIT_CALL_PY_EXACT_ARGS_2 378 +#define _INIT_CALL_PY_EXACT_ARGS_3 379 +#define _INIT_CALL_PY_EXACT_ARGS_4 380 +#define _INSTRUMENTED_CALL INSTRUMENTED_CALL +#define _INSTRUMENTED_CALL_FUNCTION_EX INSTRUMENTED_CALL_FUNCTION_EX +#define _INSTRUMENTED_CALL_KW INSTRUMENTED_CALL_KW +#define _INSTRUMENTED_FOR_ITER INSTRUMENTED_FOR_ITER +#define _INSTRUMENTED_INSTRUCTION INSTRUMENTED_INSTRUCTION +#define _INSTRUMENTED_JUMP_BACKWARD INSTRUMENTED_JUMP_BACKWARD +#define _INSTRUMENTED_JUMP_FORWARD INSTRUMENTED_JUMP_FORWARD +#define _INSTRUMENTED_LOAD_SUPER_ATTR INSTRUMENTED_LOAD_SUPER_ATTR +#define _INSTRUMENTED_POP_JUMP_IF_FALSE INSTRUMENTED_POP_JUMP_IF_FALSE +#define _INSTRUMENTED_POP_JUMP_IF_NONE INSTRUMENTED_POP_JUMP_IF_NONE +#define _INSTRUMENTED_POP_JUMP_IF_NOT_NONE INSTRUMENTED_POP_JUMP_IF_NOT_NONE +#define _INSTRUMENTED_POP_JUMP_IF_TRUE INSTRUMENTED_POP_JUMP_IF_TRUE +#define _INSTRUMENTED_RESUME INSTRUMENTED_RESUME +#define _INSTRUMENTED_RETURN_CONST INSTRUMENTED_RETURN_CONST +#define _INSTRUMENTED_RETURN_VALUE INSTRUMENTED_RETURN_VALUE +#define _INSTRUMENTED_YIELD_VALUE INSTRUMENTED_YIELD_VALUE +#define _INTERNAL_INCREMENT_OPT_COUNTER 381 +#define _IS_NONE 382 +#define _IS_OP IS_OP +#define _ITER_CHECK_LIST 383 +#define _ITER_CHECK_RANGE 384 +#define _ITER_CHECK_TUPLE 385 +#define _ITER_JUMP_LIST 386 +#define _ITER_JUMP_RANGE 387 +#define _ITER_JUMP_TUPLE 388 +#define _ITER_NEXT_LIST 389 +#define _ITER_NEXT_RANGE 390 +#define _ITER_NEXT_TUPLE 391 +#define _JUMP_TO_TOP 392 +#define _LIST_APPEND LIST_APPEND +#define _LIST_EXTEND LIST_EXTEND +#define _LOAD_ASSERTION_ERROR LOAD_ASSERTION_ERROR +#define _LOAD_ATTR 393 +#define _LOAD_ATTR_CLASS 394 +#define _LOAD_ATTR_CLASS_0 395 +#define _LOAD_ATTR_CLASS_1 396 +#define _LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN 
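+// Note on the numbering scheme visible above: a uop that corresponds
+// one-to-one with a bytecode instruction aliases that instruction's opcode
+// directly, while uops that exist only in the tier-two interpreter are
+// assigned fresh IDs counting up from 300 (_EXIT_TRACE and _SET_IP at the
+// top of this file).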
+#define _LOAD_ATTR_INSTANCE_VALUE 397 +#define _LOAD_ATTR_INSTANCE_VALUE_0 398 +#define _LOAD_ATTR_INSTANCE_VALUE_1 399 +#define _LOAD_ATTR_METHOD_LAZY_DICT 400 +#define _LOAD_ATTR_METHOD_NO_DICT 401 +#define _LOAD_ATTR_METHOD_WITH_VALUES 402 +#define _LOAD_ATTR_MODULE 403 +#define _LOAD_ATTR_NONDESCRIPTOR_NO_DICT 404 +#define _LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES 405 +#define _LOAD_ATTR_PROPERTY LOAD_ATTR_PROPERTY +#define _LOAD_ATTR_SLOT 406 +#define _LOAD_ATTR_SLOT_0 407 +#define _LOAD_ATTR_SLOT_1 408 +#define _LOAD_ATTR_WITH_HINT 409 +#define _LOAD_BUILD_CLASS LOAD_BUILD_CLASS +#define _LOAD_CONST LOAD_CONST +#define _LOAD_CONST_INLINE 410 +#define _LOAD_CONST_INLINE_BORROW 411 +#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 412 +#define _LOAD_CONST_INLINE_WITH_NULL 413 +#define _LOAD_DEREF LOAD_DEREF +#define _LOAD_FAST 414 +#define _LOAD_FAST_0 415 +#define _LOAD_FAST_1 416 +#define _LOAD_FAST_2 417 +#define _LOAD_FAST_3 418 +#define _LOAD_FAST_4 419 +#define _LOAD_FAST_5 420 +#define _LOAD_FAST_6 421 +#define _LOAD_FAST_7 422 +#define _LOAD_FAST_AND_CLEAR LOAD_FAST_AND_CLEAR +#define _LOAD_FAST_CHECK LOAD_FAST_CHECK +#define _LOAD_FAST_LOAD_FAST LOAD_FAST_LOAD_FAST +#define _LOAD_FROM_DICT_OR_DEREF LOAD_FROM_DICT_OR_DEREF +#define _LOAD_FROM_DICT_OR_GLOBALS LOAD_FROM_DICT_OR_GLOBALS +#define _LOAD_GLOBAL 423 +#define _LOAD_GLOBAL_BUILTINS 424 +#define _LOAD_GLOBAL_MODULE 425 +#define _LOAD_LOCALS LOAD_LOCALS +#define _LOAD_NAME LOAD_NAME +#define _LOAD_SUPER_ATTR_ATTR LOAD_SUPER_ATTR_ATTR +#define _LOAD_SUPER_ATTR_METHOD LOAD_SUPER_ATTR_METHOD +#define _MAKE_CELL MAKE_CELL +#define _MAKE_FUNCTION MAKE_FUNCTION +#define _MAP_ADD MAP_ADD +#define _MATCH_CLASS MATCH_CLASS +#define _MATCH_KEYS MATCH_KEYS +#define _MATCH_MAPPING MATCH_MAPPING +#define _MATCH_SEQUENCE MATCH_SEQUENCE +#define _NOP NOP +#define _POP_EXCEPT POP_EXCEPT +#define _POP_FRAME 426 +#define _POP_JUMP_IF_FALSE 427 +#define _POP_JUMP_IF_TRUE 428 +#define _POP_TOP POP_TOP +#define _POP_TOP_LOAD_CONST_INLINE_BORROW 429 +#define _PUSH_EXC_INFO PUSH_EXC_INFO +#define _PUSH_FRAME 430 +#define _PUSH_NULL PUSH_NULL +#define _PY_FRAME_GENERAL 431 +#define _REPLACE_WITH_TRUE 432 +#define _RESUME_CHECK RESUME_CHECK +#define _RETURN_GENERATOR RETURN_GENERATOR +#define _SAVE_RETURN_OFFSET 433 +#define _SEND 434 +#define _SEND_GEN SEND_GEN +#define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS +#define _SET_ADD SET_ADD +#define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE +#define _SET_UPDATE SET_UPDATE +#define _START_EXECUTOR 435 +#define _STORE_ATTR 436 +#define _STORE_ATTR_INSTANCE_VALUE 437 +#define _STORE_ATTR_SLOT 438 +#define _STORE_ATTR_WITH_HINT STORE_ATTR_WITH_HINT +#define _STORE_DEREF STORE_DEREF +#define _STORE_FAST 439 +#define _STORE_FAST_0 440 +#define _STORE_FAST_1 441 +#define _STORE_FAST_2 442 +#define _STORE_FAST_3 443 +#define _STORE_FAST_4 444 +#define _STORE_FAST_5 445 +#define _STORE_FAST_6 446 +#define _STORE_FAST_7 447 +#define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST +#define _STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST +#define _STORE_GLOBAL STORE_GLOBAL +#define _STORE_NAME STORE_NAME +#define _STORE_SLICE STORE_SLICE +#define _STORE_SUBSCR 448 +#define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT +#define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT +#define _SWAP SWAP +#define _TIER2_RESUME_CHECK 449 +#define _TO_BOOL 450 +#define _TO_BOOL_BOOL TO_BOOL_BOOL +#define _TO_BOOL_INT TO_BOOL_INT +#define _TO_BOOL_LIST TO_BOOL_LIST +#define _TO_BOOL_NONE TO_BOOL_NONE +#define _TO_BOOL_STR TO_BOOL_STR +#define 
_UNARY_INVERT UNARY_INVERT +#define _UNARY_NEGATIVE UNARY_NEGATIVE +#define _UNARY_NOT UNARY_NOT +#define _UNPACK_EX UNPACK_EX +#define _UNPACK_SEQUENCE 451 +#define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST +#define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE +#define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE +#define _WITH_EXCEPT_START WITH_EXCEPT_START +#define _YIELD_VALUE YIELD_VALUE +#define MAX_UOP_ID 451 + +#ifdef __cplusplus +} +#endif +#endif /* !Py_CORE_UOP_IDS_H */ diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h new file mode 100644 index 0000000000000000000000000000000000000000..02ffc769c180eaa09e28b9f6c0a1b199be5b0f37 --- /dev/null +++ b/Include/internal/pycore_uop_metadata.h @@ -0,0 +1,1006 @@ +// This file is generated by Tools/cases_generator/uop_metadata_generator.py +// from: +// Python/bytecodes.c +// Do not edit! + +#ifndef Py_CORE_UOP_METADATA_H +#define Py_CORE_UOP_METADATA_H +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "pycore_uop_ids.h" +extern const uint16_t _PyUop_Flags[MAX_UOP_ID+1]; +extern const uint8_t _PyUop_Replication[MAX_UOP_ID+1]; +extern const char * const _PyOpcode_uop_name[MAX_UOP_ID+1]; + +extern int _PyUop_num_popped(int opcode, int oparg); + +#ifdef NEED_OPCODE_METADATA +const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { + [_NOP] = HAS_PURE_FLAG, + [_RESUME_CHECK] = HAS_DEOPT_FLAG, + [_LOAD_FAST_CHECK] = HAS_ARG_FLAG | HAS_LOCAL_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_LOAD_FAST_0] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, + [_LOAD_FAST_1] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, + [_LOAD_FAST_2] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, + [_LOAD_FAST_3] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, + [_LOAD_FAST_4] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, + [_LOAD_FAST_5] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, + [_LOAD_FAST_6] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, + [_LOAD_FAST_7] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, + [_LOAD_FAST] = HAS_ARG_FLAG | HAS_LOCAL_FLAG | HAS_PURE_FLAG, + [_LOAD_FAST_AND_CLEAR] = HAS_ARG_FLAG | HAS_LOCAL_FLAG, + [_LOAD_FAST_LOAD_FAST] = HAS_ARG_FLAG | HAS_LOCAL_FLAG, + [_LOAD_CONST] = HAS_ARG_FLAG | HAS_CONST_FLAG | HAS_PURE_FLAG, + [_STORE_FAST_0] = HAS_LOCAL_FLAG, + [_STORE_FAST_1] = HAS_LOCAL_FLAG, + [_STORE_FAST_2] = HAS_LOCAL_FLAG, + [_STORE_FAST_3] = HAS_LOCAL_FLAG, + [_STORE_FAST_4] = HAS_LOCAL_FLAG, + [_STORE_FAST_5] = HAS_LOCAL_FLAG, + [_STORE_FAST_6] = HAS_LOCAL_FLAG, + [_STORE_FAST_7] = HAS_LOCAL_FLAG, + [_STORE_FAST] = HAS_ARG_FLAG | HAS_LOCAL_FLAG, + [_STORE_FAST_LOAD_FAST] = HAS_ARG_FLAG | HAS_LOCAL_FLAG, + [_STORE_FAST_STORE_FAST] = HAS_ARG_FLAG | HAS_LOCAL_FLAG, + [_POP_TOP] = HAS_PURE_FLAG, + [_PUSH_NULL] = HAS_PURE_FLAG, + [_END_SEND] = HAS_PURE_FLAG, + [_UNARY_NEGATIVE] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_UNARY_NOT] = HAS_PURE_FLAG, + [_TO_BOOL] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_TO_BOOL_BOOL] = HAS_EXIT_FLAG, + [_TO_BOOL_INT] = HAS_EXIT_FLAG | HAS_ESCAPES_FLAG, + [_TO_BOOL_LIST] = HAS_EXIT_FLAG, + [_TO_BOOL_NONE] = HAS_EXIT_FLAG, + [_TO_BOOL_STR] = HAS_EXIT_FLAG | HAS_ESCAPES_FLAG, + [_REPLACE_WITH_TRUE] = 0, + [_UNARY_INVERT] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_GUARD_BOTH_INT] = HAS_EXIT_FLAG, + [_GUARD_NOS_INT] = HAS_EXIT_FLAG, + [_GUARD_TOS_INT] = HAS_EXIT_FLAG, + [_BINARY_OP_MULTIPLY_INT] = HAS_ERROR_FLAG | HAS_PURE_FLAG, + [_BINARY_OP_ADD_INT] = HAS_ERROR_FLAG | HAS_PURE_FLAG, + [_BINARY_OP_SUBTRACT_INT] = HAS_ERROR_FLAG | HAS_PURE_FLAG, + [_GUARD_BOTH_FLOAT] = HAS_EXIT_FLAG, + [_GUARD_NOS_FLOAT] = HAS_EXIT_FLAG, + [_GUARD_TOS_FLOAT] = HAS_EXIT_FLAG, + 
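Two things in the generated ID table above are easy to miss: a micro-op that corresponds 1:1 to an existing bytecode instruction is simply #defined as an alias of that instruction's opcode, while uops that exist only in the tier-2 optimizer get fresh numbers starting at 300, so the two ranges can never collide; `MAX_UOP_ID` then gives one upper bound for sizing every table keyed by uop. A minimal illustration of the scheme (the include paths are assumptions; the internal header is only usable from a core build):

```c
#include <assert.h>
#include "opcode_ids.h"                 // bytecode opcode numbers
#include "internal/pycore_uop_ids.h"    // the table above (assumed path)

int main(void)
{
    assert(_EXIT_TRACE == 300);   // tier-2-only uop: fresh ID past the bytecode range
    assert(_POP_TOP == POP_TOP);  // 1:1 with a bytecode: same number, two names
    assert(MAX_UOP_ID == 451);    // single bound used to size the metadata tables
    return 0;
}
```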
[_BINARY_OP_MULTIPLY_FLOAT] = HAS_PURE_FLAG, + [_BINARY_OP_ADD_FLOAT] = HAS_PURE_FLAG, + [_BINARY_OP_SUBTRACT_FLOAT] = HAS_PURE_FLAG, + [_GUARD_BOTH_UNICODE] = HAS_EXIT_FLAG, + [_BINARY_OP_ADD_UNICODE] = HAS_ERROR_FLAG | HAS_PURE_FLAG, + [_BINARY_SUBSCR] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_BINARY_SLICE] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_STORE_SLICE] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_BINARY_SUBSCR_LIST_INT] = HAS_DEOPT_FLAG, + [_BINARY_SUBSCR_STR_INT] = HAS_DEOPT_FLAG, + [_BINARY_SUBSCR_TUPLE_INT] = HAS_DEOPT_FLAG, + [_BINARY_SUBSCR_DICT] = HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_LIST_APPEND] = HAS_ARG_FLAG | HAS_ERROR_FLAG, + [_SET_ADD] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_STORE_SUBSCR] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_STORE_SUBSCR_LIST_INT] = HAS_DEOPT_FLAG, + [_STORE_SUBSCR_DICT] = HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_DELETE_SUBSCR] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CALL_INTRINSIC_1] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CALL_INTRINSIC_2] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_POP_FRAME] = 0, + [_GET_AITER] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_GET_ANEXT] = HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG, + [_GET_AWAITABLE] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_YIELD_VALUE] = HAS_ARG_FLAG | HAS_ESCAPES_FLAG, + [_POP_EXCEPT] = HAS_ESCAPES_FLAG, + [_LOAD_ASSERTION_ERROR] = 0, + [_LOAD_BUILD_CLASS] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_STORE_NAME] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_DELETE_NAME] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG, + [_UNPACK_SEQUENCE] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_UNPACK_SEQUENCE_TWO_TUPLE] = HAS_ARG_FLAG | HAS_DEOPT_FLAG, + [_UNPACK_SEQUENCE_TUPLE] = HAS_ARG_FLAG | HAS_DEOPT_FLAG, + [_UNPACK_SEQUENCE_LIST] = HAS_ARG_FLAG | HAS_DEOPT_FLAG, + [_UNPACK_EX] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_STORE_ATTR] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_DELETE_ATTR] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_STORE_GLOBAL] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_DELETE_GLOBAL] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG, + [_LOAD_LOCALS] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_LOAD_GLOBAL] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_GUARD_GLOBALS_VERSION] = HAS_DEOPT_FLAG, + [_GUARD_BUILTINS_VERSION] = HAS_DEOPT_FLAG, + [_LOAD_GLOBAL_MODULE] = HAS_ARG_FLAG | HAS_DEOPT_FLAG, + [_LOAD_GLOBAL_BUILTINS] = HAS_ARG_FLAG | HAS_DEOPT_FLAG, + [_DELETE_FAST] = HAS_ARG_FLAG | HAS_LOCAL_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_MAKE_CELL] = HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG, + [_DELETE_DEREF] = HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG, + [_LOAD_FROM_DICT_OR_DEREF] = HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG, + [_LOAD_DEREF] = HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_STORE_DEREF] = HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ESCAPES_FLAG, + [_COPY_FREE_VARS] = HAS_ARG_FLAG, + [_BUILD_STRING] = HAS_ARG_FLAG | HAS_ERROR_FLAG, + [_BUILD_TUPLE] = HAS_ARG_FLAG | HAS_ERROR_FLAG, + [_BUILD_LIST] = HAS_ARG_FLAG | HAS_ERROR_FLAG, + [_LIST_EXTEND] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + 
[_SET_UPDATE] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_BUILD_MAP] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_SETUP_ANNOTATIONS] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_BUILD_CONST_KEY_MAP] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_DICT_UPDATE] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_DICT_MERGE] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_MAP_ADD] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_LOAD_SUPER_ATTR_ATTR] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_LOAD_SUPER_ATTR_METHOD] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_LOAD_ATTR] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_GUARD_TYPE_VERSION] = HAS_EXIT_FLAG, + [_CHECK_MANAGED_OBJECT_HAS_VALUES] = HAS_DEOPT_FLAG, + [_LOAD_ATTR_INSTANCE_VALUE_0] = HAS_DEOPT_FLAG, + [_LOAD_ATTR_INSTANCE_VALUE_1] = HAS_DEOPT_FLAG, + [_LOAD_ATTR_INSTANCE_VALUE] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_OPARG_AND_1_FLAG, + [_CHECK_ATTR_MODULE] = HAS_DEOPT_FLAG, + [_LOAD_ATTR_MODULE] = HAS_ARG_FLAG | HAS_DEOPT_FLAG, + [_CHECK_ATTR_WITH_HINT] = HAS_DEOPT_FLAG, + [_LOAD_ATTR_WITH_HINT] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG, + [_LOAD_ATTR_SLOT_0] = HAS_DEOPT_FLAG, + [_LOAD_ATTR_SLOT_1] = HAS_DEOPT_FLAG, + [_LOAD_ATTR_SLOT] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_OPARG_AND_1_FLAG, + [_CHECK_ATTR_CLASS] = HAS_DEOPT_FLAG, + [_LOAD_ATTR_CLASS_0] = 0, + [_LOAD_ATTR_CLASS_1] = 0, + [_LOAD_ATTR_CLASS] = HAS_ARG_FLAG | HAS_OPARG_AND_1_FLAG, + [_GUARD_DORV_NO_DICT] = HAS_DEOPT_FLAG, + [_STORE_ATTR_INSTANCE_VALUE] = 0, + [_STORE_ATTR_SLOT] = 0, + [_COMPARE_OP] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_COMPARE_OP_FLOAT] = HAS_ARG_FLAG, + [_COMPARE_OP_INT] = HAS_ARG_FLAG | HAS_DEOPT_FLAG, + [_COMPARE_OP_STR] = HAS_ARG_FLAG, + [_IS_OP] = HAS_ARG_FLAG, + [_CONTAINS_OP] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CONTAINS_OP_SET] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CONTAINS_OP_DICT] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CHECK_EG_MATCH] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CHECK_EXC_MATCH] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_IS_NONE] = 0, + [_GET_LEN] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_MATCH_CLASS] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_MATCH_MAPPING] = 0, + [_MATCH_SEQUENCE] = 0, + [_MATCH_KEYS] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_GET_ITER] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_GET_YIELD_FROM_ITER] = HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG, + [_FOR_ITER_TIER_TWO] = HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG, + [_ITER_CHECK_LIST] = HAS_EXIT_FLAG, + [_GUARD_NOT_EXHAUSTED_LIST] = HAS_EXIT_FLAG, + [_ITER_NEXT_LIST] = 0, + [_ITER_CHECK_TUPLE] = HAS_EXIT_FLAG, + [_GUARD_NOT_EXHAUSTED_TUPLE] = HAS_EXIT_FLAG, + [_ITER_NEXT_TUPLE] = 0, + [_ITER_CHECK_RANGE] = HAS_EXIT_FLAG, + [_GUARD_NOT_EXHAUSTED_RANGE] = HAS_EXIT_FLAG, + [_ITER_NEXT_RANGE] = HAS_ERROR_FLAG, + [_FOR_ITER_GEN_FRAME] = HAS_ARG_FLAG | HAS_DEOPT_FLAG, + [_WITH_EXCEPT_START] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_PUSH_EXC_INFO] = 0, + [_GUARD_DORV_VALUES_INST_ATTR_FROM_DICT] = HAS_DEOPT_FLAG, + [_GUARD_KEYS_VERSION] = HAS_DEOPT_FLAG, + [_LOAD_ATTR_METHOD_WITH_VALUES] = HAS_ARG_FLAG, + [_LOAD_ATTR_METHOD_NO_DICT] = HAS_ARG_FLAG, + [_LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES] = HAS_ARG_FLAG, + [_LOAD_ATTR_NONDESCRIPTOR_NO_DICT] = HAS_ARG_FLAG, 
+ [_CHECK_ATTR_METHOD_LAZY_DICT] = HAS_DEOPT_FLAG, + [_LOAD_ATTR_METHOD_LAZY_DICT] = HAS_ARG_FLAG, + [_CHECK_PERIODIC] = HAS_EVAL_BREAK_FLAG, + [_PY_FRAME_GENERAL] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG, + [_CHECK_FUNCTION_VERSION] = HAS_ARG_FLAG | HAS_EXIT_FLAG, + [_CHECK_METHOD_VERSION] = HAS_ARG_FLAG | HAS_EXIT_FLAG, + [_EXPAND_METHOD] = HAS_ARG_FLAG, + [_CHECK_IS_NOT_PY_CALLABLE] = HAS_ARG_FLAG | HAS_EXIT_FLAG, + [_CALL_NON_PY_GENERAL] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CHECK_CALL_BOUND_METHOD_EXACT_ARGS] = HAS_ARG_FLAG | HAS_EXIT_FLAG, + [_INIT_CALL_BOUND_METHOD_EXACT_ARGS] = HAS_ARG_FLAG, + [_CHECK_PEP_523] = HAS_DEOPT_FLAG, + [_CHECK_FUNCTION_EXACT_ARGS] = HAS_ARG_FLAG | HAS_EXIT_FLAG, + [_CHECK_STACK_SPACE] = HAS_ARG_FLAG | HAS_DEOPT_FLAG, + [_INIT_CALL_PY_EXACT_ARGS_0] = HAS_PURE_FLAG, + [_INIT_CALL_PY_EXACT_ARGS_1] = HAS_PURE_FLAG, + [_INIT_CALL_PY_EXACT_ARGS_2] = HAS_PURE_FLAG, + [_INIT_CALL_PY_EXACT_ARGS_3] = HAS_PURE_FLAG, + [_INIT_CALL_PY_EXACT_ARGS_4] = HAS_PURE_FLAG, + [_INIT_CALL_PY_EXACT_ARGS] = HAS_ARG_FLAG | HAS_PURE_FLAG, + [_PUSH_FRAME] = 0, + [_CALL_TYPE_1] = HAS_ARG_FLAG | HAS_DEOPT_FLAG, + [_CALL_STR_1] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CALL_TUPLE_1] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_EXIT_INIT_CHECK] = HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG, + [_CALL_BUILTIN_CLASS] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CALL_BUILTIN_O] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CALL_BUILTIN_FAST] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CALL_BUILTIN_FAST_WITH_KEYWORDS] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CALL_LEN] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG, + [_CALL_ISINSTANCE] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG, + [_CALL_METHOD_DESCRIPTOR_O] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CALL_METHOD_DESCRIPTOR_NOARGS] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_CALL_METHOD_DESCRIPTOR_FAST] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_MAKE_FUNCTION] = HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG, + [_SET_FUNCTION_ATTRIBUTE] = HAS_ARG_FLAG | HAS_ESCAPES_FLAG, + [_RETURN_GENERATOR] = HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG, + [_BUILD_SLICE] = HAS_ARG_FLAG | HAS_ERROR_FLAG, + [_CONVERT_VALUE] = HAS_ARG_FLAG | HAS_ERROR_FLAG, + [_FORMAT_SIMPLE] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_FORMAT_WITH_SPEC] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_COPY] = HAS_ARG_FLAG | HAS_PURE_FLAG, + [_BINARY_OP] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_SWAP] = HAS_ARG_FLAG | HAS_PURE_FLAG, + [_GUARD_IS_TRUE_POP] = HAS_EXIT_FLAG, + [_GUARD_IS_FALSE_POP] = HAS_EXIT_FLAG, + [_GUARD_IS_NONE_POP] = HAS_EXIT_FLAG, + [_GUARD_IS_NOT_NONE_POP] = HAS_EXIT_FLAG, + [_JUMP_TO_TOP] = 0, + [_SET_IP] = 0, + [_CHECK_STACK_SPACE_OPERAND] = HAS_DEOPT_FLAG, + [_SAVE_RETURN_OFFSET] = HAS_ARG_FLAG, + [_EXIT_TRACE] = 0, + [_CHECK_VALIDITY] = HAS_DEOPT_FLAG, + [_LOAD_CONST_INLINE] = HAS_PURE_FLAG, + [_LOAD_CONST_INLINE_BORROW] = HAS_PURE_FLAG, + [_POP_TOP_LOAD_CONST_INLINE_BORROW] = HAS_PURE_FLAG, + 
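A note on how this header is meant to be consumed: the tables are declared `extern` near the top but defined here behind `NEED_OPCODE_METADATA`, so exactly one translation unit materializes the storage and every other includer sees only the declarations. The flag bits describe, roughly, how a uop can leave a trace: `HAS_DEOPT_FLAG` means it can fall back to the tier-1 interpreter, `HAS_EXIT_FLAG` means it can take a side exit, and `HAS_ERROR_FLAG` marks exception paths. A hedged sketch of a consumer (the helper name is hypothetical, and the `HAS_*` macros are assumed to come from the companion `pycore_opcode_metadata.h`):

```c
#include <assert.h>
#include "internal/pycore_uop_metadata.h"   // extern declarations only here

// Hypothetical helper: a uop can sit in a straight-line trace tail only
// if it has no way to leave the trace early.
static int
uop_stays_on_trace(int uop)
{
    assert(0 <= uop && uop <= MAX_UOP_ID && _PyOpcode_uop_name[uop] != NULL);
    return (_PyUop_Flags[uop] & (HAS_DEOPT_FLAG | HAS_EXIT_FLAG)) == 0;
}
```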
[_LOAD_CONST_INLINE_WITH_NULL] = HAS_PURE_FLAG, + [_LOAD_CONST_INLINE_BORROW_WITH_NULL] = HAS_PURE_FLAG, + [_CHECK_FUNCTION] = HAS_DEOPT_FLAG, + [_INTERNAL_INCREMENT_OPT_COUNTER] = 0, + [_COLD_EXIT] = HAS_ARG_FLAG | HAS_ESCAPES_FLAG, + [_DYNAMIC_EXIT] = HAS_ARG_FLAG | HAS_ESCAPES_FLAG, + [_START_EXECUTOR] = HAS_DEOPT_FLAG, + [_FATAL_ERROR] = 0, + [_CHECK_VALIDITY_AND_SET_IP] = HAS_DEOPT_FLAG, + [_DEOPT] = 0, + [_ERROR_POP_N] = HAS_ARG_FLAG, + [_TIER2_RESUME_CHECK] = HAS_DEOPT_FLAG, +}; + +const uint8_t _PyUop_Replication[MAX_UOP_ID+1] = { + [_LOAD_FAST] = 8, + [_STORE_FAST] = 8, + [_INIT_CALL_PY_EXACT_ARGS] = 5, +}; + +const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = { + [_BINARY_OP] = "_BINARY_OP", + [_BINARY_OP_ADD_FLOAT] = "_BINARY_OP_ADD_FLOAT", + [_BINARY_OP_ADD_INT] = "_BINARY_OP_ADD_INT", + [_BINARY_OP_ADD_UNICODE] = "_BINARY_OP_ADD_UNICODE", + [_BINARY_OP_MULTIPLY_FLOAT] = "_BINARY_OP_MULTIPLY_FLOAT", + [_BINARY_OP_MULTIPLY_INT] = "_BINARY_OP_MULTIPLY_INT", + [_BINARY_OP_SUBTRACT_FLOAT] = "_BINARY_OP_SUBTRACT_FLOAT", + [_BINARY_OP_SUBTRACT_INT] = "_BINARY_OP_SUBTRACT_INT", + [_BINARY_SLICE] = "_BINARY_SLICE", + [_BINARY_SUBSCR] = "_BINARY_SUBSCR", + [_BINARY_SUBSCR_DICT] = "_BINARY_SUBSCR_DICT", + [_BINARY_SUBSCR_LIST_INT] = "_BINARY_SUBSCR_LIST_INT", + [_BINARY_SUBSCR_STR_INT] = "_BINARY_SUBSCR_STR_INT", + [_BINARY_SUBSCR_TUPLE_INT] = "_BINARY_SUBSCR_TUPLE_INT", + [_BUILD_CONST_KEY_MAP] = "_BUILD_CONST_KEY_MAP", + [_BUILD_LIST] = "_BUILD_LIST", + [_BUILD_MAP] = "_BUILD_MAP", + [_BUILD_SLICE] = "_BUILD_SLICE", + [_BUILD_STRING] = "_BUILD_STRING", + [_BUILD_TUPLE] = "_BUILD_TUPLE", + [_CALL_BUILTIN_CLASS] = "_CALL_BUILTIN_CLASS", + [_CALL_BUILTIN_FAST] = "_CALL_BUILTIN_FAST", + [_CALL_BUILTIN_FAST_WITH_KEYWORDS] = "_CALL_BUILTIN_FAST_WITH_KEYWORDS", + [_CALL_BUILTIN_O] = "_CALL_BUILTIN_O", + [_CALL_INTRINSIC_1] = "_CALL_INTRINSIC_1", + [_CALL_INTRINSIC_2] = "_CALL_INTRINSIC_2", + [_CALL_ISINSTANCE] = "_CALL_ISINSTANCE", + [_CALL_LEN] = "_CALL_LEN", + [_CALL_METHOD_DESCRIPTOR_FAST] = "_CALL_METHOD_DESCRIPTOR_FAST", + [_CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS] = "_CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS", + [_CALL_METHOD_DESCRIPTOR_NOARGS] = "_CALL_METHOD_DESCRIPTOR_NOARGS", + [_CALL_METHOD_DESCRIPTOR_O] = "_CALL_METHOD_DESCRIPTOR_O", + [_CALL_NON_PY_GENERAL] = "_CALL_NON_PY_GENERAL", + [_CALL_STR_1] = "_CALL_STR_1", + [_CALL_TUPLE_1] = "_CALL_TUPLE_1", + [_CALL_TYPE_1] = "_CALL_TYPE_1", + [_CHECK_ATTR_CLASS] = "_CHECK_ATTR_CLASS", + [_CHECK_ATTR_METHOD_LAZY_DICT] = "_CHECK_ATTR_METHOD_LAZY_DICT", + [_CHECK_ATTR_MODULE] = "_CHECK_ATTR_MODULE", + [_CHECK_ATTR_WITH_HINT] = "_CHECK_ATTR_WITH_HINT", + [_CHECK_CALL_BOUND_METHOD_EXACT_ARGS] = "_CHECK_CALL_BOUND_METHOD_EXACT_ARGS", + [_CHECK_EG_MATCH] = "_CHECK_EG_MATCH", + [_CHECK_EXC_MATCH] = "_CHECK_EXC_MATCH", + [_CHECK_FUNCTION] = "_CHECK_FUNCTION", + [_CHECK_FUNCTION_EXACT_ARGS] = "_CHECK_FUNCTION_EXACT_ARGS", + [_CHECK_FUNCTION_VERSION] = "_CHECK_FUNCTION_VERSION", + [_CHECK_IS_NOT_PY_CALLABLE] = "_CHECK_IS_NOT_PY_CALLABLE", + [_CHECK_MANAGED_OBJECT_HAS_VALUES] = "_CHECK_MANAGED_OBJECT_HAS_VALUES", + [_CHECK_METHOD_VERSION] = "_CHECK_METHOD_VERSION", + [_CHECK_PEP_523] = "_CHECK_PEP_523", + [_CHECK_PERIODIC] = "_CHECK_PERIODIC", + [_CHECK_STACK_SPACE] = "_CHECK_STACK_SPACE", + [_CHECK_STACK_SPACE_OPERAND] = "_CHECK_STACK_SPACE_OPERAND", + [_CHECK_VALIDITY] = "_CHECK_VALIDITY", + [_CHECK_VALIDITY_AND_SET_IP] = "_CHECK_VALIDITY_AND_SET_IP", + [_COLD_EXIT] = "_COLD_EXIT", + [_COMPARE_OP] = "_COMPARE_OP", + [_COMPARE_OP_FLOAT] = 
"_COMPARE_OP_FLOAT", + [_COMPARE_OP_INT] = "_COMPARE_OP_INT", + [_COMPARE_OP_STR] = "_COMPARE_OP_STR", + [_CONTAINS_OP] = "_CONTAINS_OP", + [_CONTAINS_OP_DICT] = "_CONTAINS_OP_DICT", + [_CONTAINS_OP_SET] = "_CONTAINS_OP_SET", + [_CONVERT_VALUE] = "_CONVERT_VALUE", + [_COPY] = "_COPY", + [_COPY_FREE_VARS] = "_COPY_FREE_VARS", + [_DELETE_ATTR] = "_DELETE_ATTR", + [_DELETE_DEREF] = "_DELETE_DEREF", + [_DELETE_FAST] = "_DELETE_FAST", + [_DELETE_GLOBAL] = "_DELETE_GLOBAL", + [_DELETE_NAME] = "_DELETE_NAME", + [_DELETE_SUBSCR] = "_DELETE_SUBSCR", + [_DEOPT] = "_DEOPT", + [_DICT_MERGE] = "_DICT_MERGE", + [_DICT_UPDATE] = "_DICT_UPDATE", + [_DYNAMIC_EXIT] = "_DYNAMIC_EXIT", + [_END_SEND] = "_END_SEND", + [_ERROR_POP_N] = "_ERROR_POP_N", + [_EXIT_INIT_CHECK] = "_EXIT_INIT_CHECK", + [_EXIT_TRACE] = "_EXIT_TRACE", + [_EXPAND_METHOD] = "_EXPAND_METHOD", + [_FATAL_ERROR] = "_FATAL_ERROR", + [_FORMAT_SIMPLE] = "_FORMAT_SIMPLE", + [_FORMAT_WITH_SPEC] = "_FORMAT_WITH_SPEC", + [_FOR_ITER_GEN_FRAME] = "_FOR_ITER_GEN_FRAME", + [_FOR_ITER_TIER_TWO] = "_FOR_ITER_TIER_TWO", + [_GET_AITER] = "_GET_AITER", + [_GET_ANEXT] = "_GET_ANEXT", + [_GET_AWAITABLE] = "_GET_AWAITABLE", + [_GET_ITER] = "_GET_ITER", + [_GET_LEN] = "_GET_LEN", + [_GET_YIELD_FROM_ITER] = "_GET_YIELD_FROM_ITER", + [_GUARD_BOTH_FLOAT] = "_GUARD_BOTH_FLOAT", + [_GUARD_BOTH_INT] = "_GUARD_BOTH_INT", + [_GUARD_BOTH_UNICODE] = "_GUARD_BOTH_UNICODE", + [_GUARD_BUILTINS_VERSION] = "_GUARD_BUILTINS_VERSION", + [_GUARD_DORV_NO_DICT] = "_GUARD_DORV_NO_DICT", + [_GUARD_DORV_VALUES_INST_ATTR_FROM_DICT] = "_GUARD_DORV_VALUES_INST_ATTR_FROM_DICT", + [_GUARD_GLOBALS_VERSION] = "_GUARD_GLOBALS_VERSION", + [_GUARD_IS_FALSE_POP] = "_GUARD_IS_FALSE_POP", + [_GUARD_IS_NONE_POP] = "_GUARD_IS_NONE_POP", + [_GUARD_IS_NOT_NONE_POP] = "_GUARD_IS_NOT_NONE_POP", + [_GUARD_IS_TRUE_POP] = "_GUARD_IS_TRUE_POP", + [_GUARD_KEYS_VERSION] = "_GUARD_KEYS_VERSION", + [_GUARD_NOS_FLOAT] = "_GUARD_NOS_FLOAT", + [_GUARD_NOS_INT] = "_GUARD_NOS_INT", + [_GUARD_NOT_EXHAUSTED_LIST] = "_GUARD_NOT_EXHAUSTED_LIST", + [_GUARD_NOT_EXHAUSTED_RANGE] = "_GUARD_NOT_EXHAUSTED_RANGE", + [_GUARD_NOT_EXHAUSTED_TUPLE] = "_GUARD_NOT_EXHAUSTED_TUPLE", + [_GUARD_TOS_FLOAT] = "_GUARD_TOS_FLOAT", + [_GUARD_TOS_INT] = "_GUARD_TOS_INT", + [_GUARD_TYPE_VERSION] = "_GUARD_TYPE_VERSION", + [_INIT_CALL_BOUND_METHOD_EXACT_ARGS] = "_INIT_CALL_BOUND_METHOD_EXACT_ARGS", + [_INIT_CALL_PY_EXACT_ARGS] = "_INIT_CALL_PY_EXACT_ARGS", + [_INIT_CALL_PY_EXACT_ARGS_0] = "_INIT_CALL_PY_EXACT_ARGS_0", + [_INIT_CALL_PY_EXACT_ARGS_1] = "_INIT_CALL_PY_EXACT_ARGS_1", + [_INIT_CALL_PY_EXACT_ARGS_2] = "_INIT_CALL_PY_EXACT_ARGS_2", + [_INIT_CALL_PY_EXACT_ARGS_3] = "_INIT_CALL_PY_EXACT_ARGS_3", + [_INIT_CALL_PY_EXACT_ARGS_4] = "_INIT_CALL_PY_EXACT_ARGS_4", + [_INTERNAL_INCREMENT_OPT_COUNTER] = "_INTERNAL_INCREMENT_OPT_COUNTER", + [_IS_NONE] = "_IS_NONE", + [_IS_OP] = "_IS_OP", + [_ITER_CHECK_LIST] = "_ITER_CHECK_LIST", + [_ITER_CHECK_RANGE] = "_ITER_CHECK_RANGE", + [_ITER_CHECK_TUPLE] = "_ITER_CHECK_TUPLE", + [_ITER_NEXT_LIST] = "_ITER_NEXT_LIST", + [_ITER_NEXT_RANGE] = "_ITER_NEXT_RANGE", + [_ITER_NEXT_TUPLE] = "_ITER_NEXT_TUPLE", + [_JUMP_TO_TOP] = "_JUMP_TO_TOP", + [_LIST_APPEND] = "_LIST_APPEND", + [_LIST_EXTEND] = "_LIST_EXTEND", + [_LOAD_ASSERTION_ERROR] = "_LOAD_ASSERTION_ERROR", + [_LOAD_ATTR] = "_LOAD_ATTR", + [_LOAD_ATTR_CLASS] = "_LOAD_ATTR_CLASS", + [_LOAD_ATTR_CLASS_0] = "_LOAD_ATTR_CLASS_0", + [_LOAD_ATTR_CLASS_1] = "_LOAD_ATTR_CLASS_1", + [_LOAD_ATTR_INSTANCE_VALUE] = "_LOAD_ATTR_INSTANCE_VALUE", + 
[_LOAD_ATTR_INSTANCE_VALUE_0] = "_LOAD_ATTR_INSTANCE_VALUE_0", + [_LOAD_ATTR_INSTANCE_VALUE_1] = "_LOAD_ATTR_INSTANCE_VALUE_1", + [_LOAD_ATTR_METHOD_LAZY_DICT] = "_LOAD_ATTR_METHOD_LAZY_DICT", + [_LOAD_ATTR_METHOD_NO_DICT] = "_LOAD_ATTR_METHOD_NO_DICT", + [_LOAD_ATTR_METHOD_WITH_VALUES] = "_LOAD_ATTR_METHOD_WITH_VALUES", + [_LOAD_ATTR_MODULE] = "_LOAD_ATTR_MODULE", + [_LOAD_ATTR_NONDESCRIPTOR_NO_DICT] = "_LOAD_ATTR_NONDESCRIPTOR_NO_DICT", + [_LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES] = "_LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES", + [_LOAD_ATTR_SLOT] = "_LOAD_ATTR_SLOT", + [_LOAD_ATTR_SLOT_0] = "_LOAD_ATTR_SLOT_0", + [_LOAD_ATTR_SLOT_1] = "_LOAD_ATTR_SLOT_1", + [_LOAD_ATTR_WITH_HINT] = "_LOAD_ATTR_WITH_HINT", + [_LOAD_BUILD_CLASS] = "_LOAD_BUILD_CLASS", + [_LOAD_CONST] = "_LOAD_CONST", + [_LOAD_CONST_INLINE] = "_LOAD_CONST_INLINE", + [_LOAD_CONST_INLINE_BORROW] = "_LOAD_CONST_INLINE_BORROW", + [_LOAD_CONST_INLINE_BORROW_WITH_NULL] = "_LOAD_CONST_INLINE_BORROW_WITH_NULL", + [_LOAD_CONST_INLINE_WITH_NULL] = "_LOAD_CONST_INLINE_WITH_NULL", + [_LOAD_DEREF] = "_LOAD_DEREF", + [_LOAD_FAST] = "_LOAD_FAST", + [_LOAD_FAST_0] = "_LOAD_FAST_0", + [_LOAD_FAST_1] = "_LOAD_FAST_1", + [_LOAD_FAST_2] = "_LOAD_FAST_2", + [_LOAD_FAST_3] = "_LOAD_FAST_3", + [_LOAD_FAST_4] = "_LOAD_FAST_4", + [_LOAD_FAST_5] = "_LOAD_FAST_5", + [_LOAD_FAST_6] = "_LOAD_FAST_6", + [_LOAD_FAST_7] = "_LOAD_FAST_7", + [_LOAD_FAST_AND_CLEAR] = "_LOAD_FAST_AND_CLEAR", + [_LOAD_FAST_CHECK] = "_LOAD_FAST_CHECK", + [_LOAD_FAST_LOAD_FAST] = "_LOAD_FAST_LOAD_FAST", + [_LOAD_FROM_DICT_OR_DEREF] = "_LOAD_FROM_DICT_OR_DEREF", + [_LOAD_GLOBAL] = "_LOAD_GLOBAL", + [_LOAD_GLOBAL_BUILTINS] = "_LOAD_GLOBAL_BUILTINS", + [_LOAD_GLOBAL_MODULE] = "_LOAD_GLOBAL_MODULE", + [_LOAD_LOCALS] = "_LOAD_LOCALS", + [_LOAD_SUPER_ATTR_ATTR] = "_LOAD_SUPER_ATTR_ATTR", + [_LOAD_SUPER_ATTR_METHOD] = "_LOAD_SUPER_ATTR_METHOD", + [_MAKE_CELL] = "_MAKE_CELL", + [_MAKE_FUNCTION] = "_MAKE_FUNCTION", + [_MAP_ADD] = "_MAP_ADD", + [_MATCH_CLASS] = "_MATCH_CLASS", + [_MATCH_KEYS] = "_MATCH_KEYS", + [_MATCH_MAPPING] = "_MATCH_MAPPING", + [_MATCH_SEQUENCE] = "_MATCH_SEQUENCE", + [_NOP] = "_NOP", + [_POP_EXCEPT] = "_POP_EXCEPT", + [_POP_FRAME] = "_POP_FRAME", + [_POP_TOP] = "_POP_TOP", + [_POP_TOP_LOAD_CONST_INLINE_BORROW] = "_POP_TOP_LOAD_CONST_INLINE_BORROW", + [_PUSH_EXC_INFO] = "_PUSH_EXC_INFO", + [_PUSH_FRAME] = "_PUSH_FRAME", + [_PUSH_NULL] = "_PUSH_NULL", + [_PY_FRAME_GENERAL] = "_PY_FRAME_GENERAL", + [_REPLACE_WITH_TRUE] = "_REPLACE_WITH_TRUE", + [_RESUME_CHECK] = "_RESUME_CHECK", + [_RETURN_GENERATOR] = "_RETURN_GENERATOR", + [_SAVE_RETURN_OFFSET] = "_SAVE_RETURN_OFFSET", + [_SETUP_ANNOTATIONS] = "_SETUP_ANNOTATIONS", + [_SET_ADD] = "_SET_ADD", + [_SET_FUNCTION_ATTRIBUTE] = "_SET_FUNCTION_ATTRIBUTE", + [_SET_IP] = "_SET_IP", + [_SET_UPDATE] = "_SET_UPDATE", + [_START_EXECUTOR] = "_START_EXECUTOR", + [_STORE_ATTR] = "_STORE_ATTR", + [_STORE_ATTR_INSTANCE_VALUE] = "_STORE_ATTR_INSTANCE_VALUE", + [_STORE_ATTR_SLOT] = "_STORE_ATTR_SLOT", + [_STORE_DEREF] = "_STORE_DEREF", + [_STORE_FAST] = "_STORE_FAST", + [_STORE_FAST_0] = "_STORE_FAST_0", + [_STORE_FAST_1] = "_STORE_FAST_1", + [_STORE_FAST_2] = "_STORE_FAST_2", + [_STORE_FAST_3] = "_STORE_FAST_3", + [_STORE_FAST_4] = "_STORE_FAST_4", + [_STORE_FAST_5] = "_STORE_FAST_5", + [_STORE_FAST_6] = "_STORE_FAST_6", + [_STORE_FAST_7] = "_STORE_FAST_7", + [_STORE_FAST_LOAD_FAST] = "_STORE_FAST_LOAD_FAST", + [_STORE_FAST_STORE_FAST] = "_STORE_FAST_STORE_FAST", + [_STORE_GLOBAL] = "_STORE_GLOBAL", + [_STORE_NAME] = "_STORE_NAME", + 
[_STORE_SLICE] = "_STORE_SLICE", + [_STORE_SUBSCR] = "_STORE_SUBSCR", + [_STORE_SUBSCR_DICT] = "_STORE_SUBSCR_DICT", + [_STORE_SUBSCR_LIST_INT] = "_STORE_SUBSCR_LIST_INT", + [_SWAP] = "_SWAP", + [_TIER2_RESUME_CHECK] = "_TIER2_RESUME_CHECK", + [_TO_BOOL] = "_TO_BOOL", + [_TO_BOOL_BOOL] = "_TO_BOOL_BOOL", + [_TO_BOOL_INT] = "_TO_BOOL_INT", + [_TO_BOOL_LIST] = "_TO_BOOL_LIST", + [_TO_BOOL_NONE] = "_TO_BOOL_NONE", + [_TO_BOOL_STR] = "_TO_BOOL_STR", + [_UNARY_INVERT] = "_UNARY_INVERT", + [_UNARY_NEGATIVE] = "_UNARY_NEGATIVE", + [_UNARY_NOT] = "_UNARY_NOT", + [_UNPACK_EX] = "_UNPACK_EX", + [_UNPACK_SEQUENCE] = "_UNPACK_SEQUENCE", + [_UNPACK_SEQUENCE_LIST] = "_UNPACK_SEQUENCE_LIST", + [_UNPACK_SEQUENCE_TUPLE] = "_UNPACK_SEQUENCE_TUPLE", + [_UNPACK_SEQUENCE_TWO_TUPLE] = "_UNPACK_SEQUENCE_TWO_TUPLE", + [_WITH_EXCEPT_START] = "_WITH_EXCEPT_START", + [_YIELD_VALUE] = "_YIELD_VALUE", +}; +int _PyUop_num_popped(int opcode, int oparg) +{ + switch(opcode) { + case _NOP: + return 0; + case _RESUME_CHECK: + return 0; + case _LOAD_FAST_CHECK: + return 0; + case _LOAD_FAST_0: + return 0; + case _LOAD_FAST_1: + return 0; + case _LOAD_FAST_2: + return 0; + case _LOAD_FAST_3: + return 0; + case _LOAD_FAST_4: + return 0; + case _LOAD_FAST_5: + return 0; + case _LOAD_FAST_6: + return 0; + case _LOAD_FAST_7: + return 0; + case _LOAD_FAST: + return 0; + case _LOAD_FAST_AND_CLEAR: + return 0; + case _LOAD_FAST_LOAD_FAST: + return 0; + case _LOAD_CONST: + return 0; + case _STORE_FAST_0: + return 1; + case _STORE_FAST_1: + return 1; + case _STORE_FAST_2: + return 1; + case _STORE_FAST_3: + return 1; + case _STORE_FAST_4: + return 1; + case _STORE_FAST_5: + return 1; + case _STORE_FAST_6: + return 1; + case _STORE_FAST_7: + return 1; + case _STORE_FAST: + return 1; + case _STORE_FAST_LOAD_FAST: + return 1; + case _STORE_FAST_STORE_FAST: + return 2; + case _POP_TOP: + return 1; + case _PUSH_NULL: + return 0; + case _END_SEND: + return 2; + case _UNARY_NEGATIVE: + return 1; + case _UNARY_NOT: + return 1; + case _TO_BOOL: + return 1; + case _TO_BOOL_BOOL: + return 1; + case _TO_BOOL_INT: + return 1; + case _TO_BOOL_LIST: + return 1; + case _TO_BOOL_NONE: + return 1; + case _TO_BOOL_STR: + return 1; + case _REPLACE_WITH_TRUE: + return 1; + case _UNARY_INVERT: + return 1; + case _GUARD_BOTH_INT: + return 2; + case _GUARD_NOS_INT: + return 2; + case _GUARD_TOS_INT: + return 1; + case _BINARY_OP_MULTIPLY_INT: + return 2; + case _BINARY_OP_ADD_INT: + return 2; + case _BINARY_OP_SUBTRACT_INT: + return 2; + case _GUARD_BOTH_FLOAT: + return 2; + case _GUARD_NOS_FLOAT: + return 2; + case _GUARD_TOS_FLOAT: + return 1; + case _BINARY_OP_MULTIPLY_FLOAT: + return 2; + case _BINARY_OP_ADD_FLOAT: + return 2; + case _BINARY_OP_SUBTRACT_FLOAT: + return 2; + case _GUARD_BOTH_UNICODE: + return 2; + case _BINARY_OP_ADD_UNICODE: + return 2; + case _BINARY_SUBSCR: + return 2; + case _BINARY_SLICE: + return 3; + case _STORE_SLICE: + return 4; + case _BINARY_SUBSCR_LIST_INT: + return 2; + case _BINARY_SUBSCR_STR_INT: + return 2; + case _BINARY_SUBSCR_TUPLE_INT: + return 2; + case _BINARY_SUBSCR_DICT: + return 2; + case _LIST_APPEND: + return 2 + (oparg-1); + case _SET_ADD: + return 2 + (oparg-1); + case _STORE_SUBSCR: + return 3; + case _STORE_SUBSCR_LIST_INT: + return 3; + case _STORE_SUBSCR_DICT: + return 3; + case _DELETE_SUBSCR: + return 2; + case _CALL_INTRINSIC_1: + return 1; + case _CALL_INTRINSIC_2: + return 2; + case _POP_FRAME: + return 1; + case _GET_AITER: + return 1; + case _GET_ANEXT: + return 1; + case _GET_AWAITABLE: + return 1; 
+ case _YIELD_VALUE: + return 1; + case _POP_EXCEPT: + return 1; + case _LOAD_ASSERTION_ERROR: + return 0; + case _LOAD_BUILD_CLASS: + return 0; + case _STORE_NAME: + return 1; + case _DELETE_NAME: + return 0; + case _UNPACK_SEQUENCE: + return 1; + case _UNPACK_SEQUENCE_TWO_TUPLE: + return 1; + case _UNPACK_SEQUENCE_TUPLE: + return 1; + case _UNPACK_SEQUENCE_LIST: + return 1; + case _UNPACK_EX: + return 1; + case _STORE_ATTR: + return 2; + case _DELETE_ATTR: + return 1; + case _STORE_GLOBAL: + return 1; + case _DELETE_GLOBAL: + return 0; + case _LOAD_LOCALS: + return 0; + case _LOAD_GLOBAL: + return 0; + case _GUARD_GLOBALS_VERSION: + return 0; + case _GUARD_BUILTINS_VERSION: + return 0; + case _LOAD_GLOBAL_MODULE: + return 0; + case _LOAD_GLOBAL_BUILTINS: + return 0; + case _DELETE_FAST: + return 0; + case _MAKE_CELL: + return 0; + case _DELETE_DEREF: + return 0; + case _LOAD_FROM_DICT_OR_DEREF: + return 1; + case _LOAD_DEREF: + return 0; + case _STORE_DEREF: + return 1; + case _COPY_FREE_VARS: + return 0; + case _BUILD_STRING: + return oparg; + case _BUILD_TUPLE: + return oparg; + case _BUILD_LIST: + return oparg; + case _LIST_EXTEND: + return 2 + (oparg-1); + case _SET_UPDATE: + return 2 + (oparg-1); + case _BUILD_MAP: + return oparg*2; + case _SETUP_ANNOTATIONS: + return 0; + case _BUILD_CONST_KEY_MAP: + return 1 + oparg; + case _DICT_UPDATE: + return 2 + (oparg - 1); + case _DICT_MERGE: + return 5 + (oparg - 1); + case _MAP_ADD: + return 3 + (oparg - 1); + case _LOAD_SUPER_ATTR_ATTR: + return 3; + case _LOAD_SUPER_ATTR_METHOD: + return 3; + case _LOAD_ATTR: + return 1; + case _GUARD_TYPE_VERSION: + return 1; + case _CHECK_MANAGED_OBJECT_HAS_VALUES: + return 1; + case _LOAD_ATTR_INSTANCE_VALUE_0: + return 1; + case _LOAD_ATTR_INSTANCE_VALUE_1: + return 1; + case _LOAD_ATTR_INSTANCE_VALUE: + return 1; + case _CHECK_ATTR_MODULE: + return 1; + case _LOAD_ATTR_MODULE: + return 1; + case _CHECK_ATTR_WITH_HINT: + return 1; + case _LOAD_ATTR_WITH_HINT: + return 1; + case _LOAD_ATTR_SLOT_0: + return 1; + case _LOAD_ATTR_SLOT_1: + return 1; + case _LOAD_ATTR_SLOT: + return 1; + case _CHECK_ATTR_CLASS: + return 1; + case _LOAD_ATTR_CLASS_0: + return 1; + case _LOAD_ATTR_CLASS_1: + return 1; + case _LOAD_ATTR_CLASS: + return 1; + case _GUARD_DORV_NO_DICT: + return 1; + case _STORE_ATTR_INSTANCE_VALUE: + return 2; + case _STORE_ATTR_SLOT: + return 2; + case _COMPARE_OP: + return 2; + case _COMPARE_OP_FLOAT: + return 2; + case _COMPARE_OP_INT: + return 2; + case _COMPARE_OP_STR: + return 2; + case _IS_OP: + return 2; + case _CONTAINS_OP: + return 2; + case _CONTAINS_OP_SET: + return 2; + case _CONTAINS_OP_DICT: + return 2; + case _CHECK_EG_MATCH: + return 2; + case _CHECK_EXC_MATCH: + return 2; + case _IS_NONE: + return 1; + case _GET_LEN: + return 1; + case _MATCH_CLASS: + return 3; + case _MATCH_MAPPING: + return 1; + case _MATCH_SEQUENCE: + return 1; + case _MATCH_KEYS: + return 2; + case _GET_ITER: + return 1; + case _GET_YIELD_FROM_ITER: + return 1; + case _FOR_ITER_TIER_TWO: + return 1; + case _ITER_CHECK_LIST: + return 1; + case _GUARD_NOT_EXHAUSTED_LIST: + return 1; + case _ITER_NEXT_LIST: + return 1; + case _ITER_CHECK_TUPLE: + return 1; + case _GUARD_NOT_EXHAUSTED_TUPLE: + return 1; + case _ITER_NEXT_TUPLE: + return 1; + case _ITER_CHECK_RANGE: + return 1; + case _GUARD_NOT_EXHAUSTED_RANGE: + return 1; + case _ITER_NEXT_RANGE: + return 1; + case _FOR_ITER_GEN_FRAME: + return 1; + case _WITH_EXCEPT_START: + return 4; + case _PUSH_EXC_INFO: + return 1; + case 
_GUARD_DORV_VALUES_INST_ATTR_FROM_DICT: + return 1; + case _GUARD_KEYS_VERSION: + return 1; + case _LOAD_ATTR_METHOD_WITH_VALUES: + return 1; + case _LOAD_ATTR_METHOD_NO_DICT: + return 1; + case _LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES: + return 1; + case _LOAD_ATTR_NONDESCRIPTOR_NO_DICT: + return 1; + case _CHECK_ATTR_METHOD_LAZY_DICT: + return 1; + case _LOAD_ATTR_METHOD_LAZY_DICT: + return 1; + case _CHECK_PERIODIC: + return 0; + case _PY_FRAME_GENERAL: + return 2 + oparg; + case _CHECK_FUNCTION_VERSION: + return 2 + oparg; + case _CHECK_METHOD_VERSION: + return 2 + oparg; + case _EXPAND_METHOD: + return 2 + oparg; + case _CHECK_IS_NOT_PY_CALLABLE: + return 2 + oparg; + case _CALL_NON_PY_GENERAL: + return 2 + oparg; + case _CHECK_CALL_BOUND_METHOD_EXACT_ARGS: + return 2 + oparg; + case _INIT_CALL_BOUND_METHOD_EXACT_ARGS: + return 2 + oparg; + case _CHECK_PEP_523: + return 0; + case _CHECK_FUNCTION_EXACT_ARGS: + return 2 + oparg; + case _CHECK_STACK_SPACE: + return 2 + oparg; + case _INIT_CALL_PY_EXACT_ARGS_0: + return 2 + oparg; + case _INIT_CALL_PY_EXACT_ARGS_1: + return 2 + oparg; + case _INIT_CALL_PY_EXACT_ARGS_2: + return 2 + oparg; + case _INIT_CALL_PY_EXACT_ARGS_3: + return 2 + oparg; + case _INIT_CALL_PY_EXACT_ARGS_4: + return 2 + oparg; + case _INIT_CALL_PY_EXACT_ARGS: + return 2 + oparg; + case _PUSH_FRAME: + return 1; + case _CALL_TYPE_1: + return 3; + case _CALL_STR_1: + return 3; + case _CALL_TUPLE_1: + return 3; + case _EXIT_INIT_CHECK: + return 1; + case _CALL_BUILTIN_CLASS: + return 2 + oparg; + case _CALL_BUILTIN_O: + return 2 + oparg; + case _CALL_BUILTIN_FAST: + return 2 + oparg; + case _CALL_BUILTIN_FAST_WITH_KEYWORDS: + return 2 + oparg; + case _CALL_LEN: + return 2 + oparg; + case _CALL_ISINSTANCE: + return 2 + oparg; + case _CALL_METHOD_DESCRIPTOR_O: + return 2 + oparg; + case _CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS: + return 2 + oparg; + case _CALL_METHOD_DESCRIPTOR_NOARGS: + return 2 + oparg; + case _CALL_METHOD_DESCRIPTOR_FAST: + return 2 + oparg; + case _MAKE_FUNCTION: + return 1; + case _SET_FUNCTION_ATTRIBUTE: + return 2; + case _RETURN_GENERATOR: + return 0; + case _BUILD_SLICE: + return 2 + ((oparg == 3) ? 
1 : 0); + case _CONVERT_VALUE: + return 1; + case _FORMAT_SIMPLE: + return 1; + case _FORMAT_WITH_SPEC: + return 2; + case _COPY: + return 1 + (oparg-1); + case _BINARY_OP: + return 2; + case _SWAP: + return 2 + (oparg-2); + case _GUARD_IS_TRUE_POP: + return 1; + case _GUARD_IS_FALSE_POP: + return 1; + case _GUARD_IS_NONE_POP: + return 1; + case _GUARD_IS_NOT_NONE_POP: + return 1; + case _JUMP_TO_TOP: + return 0; + case _SET_IP: + return 0; + case _CHECK_STACK_SPACE_OPERAND: + return 0; + case _SAVE_RETURN_OFFSET: + return 0; + case _EXIT_TRACE: + return 0; + case _CHECK_VALIDITY: + return 0; + case _LOAD_CONST_INLINE: + return 0; + case _LOAD_CONST_INLINE_BORROW: + return 0; + case _POP_TOP_LOAD_CONST_INLINE_BORROW: + return 1; + case _LOAD_CONST_INLINE_WITH_NULL: + return 0; + case _LOAD_CONST_INLINE_BORROW_WITH_NULL: + return 0; + case _CHECK_FUNCTION: + return 0; + case _INTERNAL_INCREMENT_OPT_COUNTER: + return 1; + case _COLD_EXIT: + return 0; + case _DYNAMIC_EXIT: + return 0; + case _START_EXECUTOR: + return 0; + case _FATAL_ERROR: + return 0; + case _CHECK_VALIDITY_AND_SET_IP: + return 0; + case _DEOPT: + return 0; + case _ERROR_POP_N: + return oparg; + case _TIER2_RESUME_CHECK: + return 0; + default: + return -1; + } +} + +#endif // NEED_OPCODE_METADATA + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_CORE_UOP_METADATA_H */ diff --git a/Include/internal/pycore_warnings.h b/Include/internal/pycore_warnings.h new file mode 100644 index 0000000000000000000000000000000000000000..f9f6559312f4ef670c16a16092c788a0ef428d79 --- /dev/null +++ b/Include/internal/pycore_warnings.h @@ -0,0 +1,31 @@ +#ifndef Py_INTERNAL_WARNINGS_H +#define Py_INTERNAL_WARNINGS_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +struct _warnings_runtime_state { + /* Both 'filters' and 'onceregistry' can be set in warnings.py; + get_warnings_attr() will reset these variables accordingly. 
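The stack-effect function just defined is table data in code form: most uops pop a fixed count, but the variadic ones (builds, calls, `_COPY`/`_SWAP`) scale with `oparg`, which is why the signature takes both, and an unknown ID returns -1 rather than asserting. A few illustrative expectations matching the cases above (a core build with `NEED_OPCODE_METADATA` defined in this one file is assumed):

```c
#include <assert.h>
#define NEED_OPCODE_METADATA                 // materialize the definitions here
#include "internal/pycore_uop_metadata.h"

int main(void)
{
    assert(_PyUop_num_popped(_POP_TOP, 0) == 1);        // fixed effect
    assert(_PyUop_num_popped(_BUILD_TUPLE, 3) == 3);    // pops oparg items
    assert(_PyUop_num_popped(_CALL_BUILTIN_O, 1) == 3); // callable + self + oparg args
    assert(_PyUop_num_popped(9999, 0) == -1);           // out-of-range uop
    return 0;
}
```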
*/ + PyObject *filters; /* List */ + PyObject *once_registry; /* Dict */ + PyObject *default_action; /* String */ + PyMutex mutex; + long filters_version; +}; + +extern int _PyWarnings_InitState(PyInterpreterState *interp); + +extern PyObject* _PyWarnings_Init(void); + +extern void _PyErr_WarnUnawaitedCoroutine(PyObject *coro); +extern void _PyErr_WarnUnawaitedAgenMethod(PyAsyncGenObject *agen, PyObject *method); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_WARNINGS_H */ diff --git a/Include/internal/pycore_weakref.h b/Include/internal/pycore_weakref.h new file mode 100644 index 0000000000000000000000000000000000000000..ff1395ea837dcb832761afb5ef3c1d4fb430bfb3 --- /dev/null +++ b/Include/internal/pycore_weakref.h @@ -0,0 +1,133 @@ +#ifndef Py_INTERNAL_WEAKREF_H +#define Py_INTERNAL_WEAKREF_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION() +#include "pycore_lock.h" +#include "pycore_object.h" // _Py_REF_IS_MERGED() +#include "pycore_pyatomic_ft_wrappers.h" + +#ifdef Py_GIL_DISABLED + +#define WEAKREF_LIST_LOCK(obj) \ + _PyInterpreterState_GET() \ + ->weakref_locks[((uintptr_t)obj) % NUM_WEAKREF_LIST_LOCKS] + +// Lock using the referenced object +#define LOCK_WEAKREFS(obj) \ + PyMutex_LockFlags(&WEAKREF_LIST_LOCK(obj), _Py_LOCK_DONT_DETACH) +#define UNLOCK_WEAKREFS(obj) PyMutex_Unlock(&WEAKREF_LIST_LOCK(obj)) + +// Lock using a weakref +#define LOCK_WEAKREFS_FOR_WR(wr) \ + PyMutex_LockFlags(wr->weakrefs_lock, _Py_LOCK_DONT_DETACH) +#define UNLOCK_WEAKREFS_FOR_WR(wr) PyMutex_Unlock(wr->weakrefs_lock) + +#define FT_CLEAR_WEAKREFS(obj, weakref_list) \ + do { \ + assert(Py_REFCNT(obj) == 0); \ + PyObject_ClearWeakRefs(obj); \ + } while (0) + +#else + +#define LOCK_WEAKREFS(obj) +#define UNLOCK_WEAKREFS(obj) + +#define LOCK_WEAKREFS_FOR_WR(wr) +#define UNLOCK_WEAKREFS_FOR_WR(wr) + +#define FT_CLEAR_WEAKREFS(obj, weakref_list) \ + do { \ + assert(Py_REFCNT(obj) == 0); \ + if (weakref_list != NULL) { \ + PyObject_ClearWeakRefs(obj); \ + } \ + } while (0) + +#endif + +static inline int _is_dead(PyObject *obj) +{ + // Explanation for the Py_REFCNT() check: when a weakref's target is part + // of a long chain of deallocations which triggers the trashcan mechanism, + // clearing the weakrefs can be delayed long after the target's refcount + // has dropped to zero. In the meantime, code accessing the weakref will + // be able to "see" the target object even though it is supposed to be + // unreachable. See issue gh-60806. 
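In the free-threaded build the dead check cannot use `Py_REFCNT()` directly either: an object is only definitively dead once its shared refcount field has merged to zero, which is what the atomic comparison in the `#if defined(Py_GIL_DISABLED)` branch just below tests. The locking macros above follow a matching design: instead of a per-object lock word, each object hashes to one mutex out of a small fixed pool in the interpreter state, so unrelated objects may share a lock, but at most one list lock is ever held at a time. A hedged sketch of that selection, mirroring `WEAKREF_LIST_LOCK` (the `weakref_locks` array and `NUM_WEAKREF_LIST_LOCKS` are assumed to live in the internal interpreter-state definition):

```c
// Illustrative only: how an object maps to its weakref-list lock under
// Py_GIL_DISABLED. Sharding by address trades a little lock sharing for
// zero per-object storage.
static inline PyMutex *
weakref_lock_for(PyInterpreterState *interp, PyObject *obj)
{
    return &interp->weakref_locks[(uintptr_t)obj % NUM_WEAKREF_LIST_LOCKS];
}
```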
+#if defined(Py_GIL_DISABLED) + Py_ssize_t shared = _Py_atomic_load_ssize_relaxed(&obj->ob_ref_shared); + return shared == _Py_REF_SHARED(0, _Py_REF_MERGED); +#else + return (Py_REFCNT(obj) == 0); +#endif +} + +static inline PyObject* _PyWeakref_GET_REF(PyObject *ref_obj) +{ + assert(PyWeakref_Check(ref_obj)); + PyWeakReference *ref = _Py_CAST(PyWeakReference*, ref_obj); + + PyObject *obj = FT_ATOMIC_LOAD_PTR(ref->wr_object); + if (obj == Py_None) { + // clear_weakref() was called + return NULL; + } + + LOCK_WEAKREFS(obj); +#ifdef Py_GIL_DISABLED + if (ref->wr_object == Py_None) { + // clear_weakref() was called + UNLOCK_WEAKREFS(obj); + return NULL; + } +#endif + if (_Py_TryIncref(obj)) { + UNLOCK_WEAKREFS(obj); + return obj; + } + UNLOCK_WEAKREFS(obj); + return NULL; +} + +static inline int _PyWeakref_IS_DEAD(PyObject *ref_obj) +{ + assert(PyWeakref_Check(ref_obj)); + int ret = 0; + PyWeakReference *ref = _Py_CAST(PyWeakReference*, ref_obj); + PyObject *obj = FT_ATOMIC_LOAD_PTR(ref->wr_object); + if (obj == Py_None) { + // clear_weakref() was called + ret = 1; + } + else { + LOCK_WEAKREFS(obj); + // See _PyWeakref_GET_REF() for the rationale of this test +#ifdef Py_GIL_DISABLED + ret = (ref->wr_object == Py_None) || _is_dead(obj); +#else + ret = _is_dead(obj); +#endif + UNLOCK_WEAKREFS(obj); + } + return ret; +} + +extern Py_ssize_t _PyWeakref_GetWeakrefCount(PyObject *obj); + +// Clear all the weak references to obj but leave their callbacks uncalled and +// intact. +extern void _PyWeakref_ClearWeakRefsNoCallbacks(PyObject *obj); + +PyAPI_FUNC(int) _PyWeakref_IsDead(PyObject *weakref); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_WEAKREF_H */ diff --git a/Include/intrcheck.h b/Include/intrcheck.h new file mode 100644 index 0000000000000000000000000000000000000000..1d1feee83de483033212e158aa4a131b2cc17cd8 --- /dev/null +++ b/Include/intrcheck.h @@ -0,0 +1,23 @@ +#ifndef Py_INTRCHECK_H +#define Py_INTRCHECK_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_FUNC(int) PyOS_InterruptOccurred(void); + +#ifdef HAVE_FORK +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000 +PyAPI_FUNC(void) PyOS_BeforeFork(void); +PyAPI_FUNC(void) PyOS_AfterFork_Parent(void); +PyAPI_FUNC(void) PyOS_AfterFork_Child(void); +#endif +#endif + +/* Deprecated, please use PyOS_AfterFork_Child() instead */ +Py_DEPRECATED(3.7) PyAPI_FUNC(void) PyOS_AfterFork(void); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTRCHECK_H */ diff --git a/Include/iterobject.h b/Include/iterobject.h new file mode 100644 index 0000000000000000000000000000000000000000..e69d09719bb4d12be2e22142b35ddaf3b0afb707 --- /dev/null +++ b/Include/iterobject.h @@ -0,0 +1,24 @@ +#ifndef Py_ITEROBJECT_H +#define Py_ITEROBJECT_H +/* Iterators (the basic kind, over a sequence) */ +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_DATA(PyTypeObject) PySeqIter_Type; +PyAPI_DATA(PyTypeObject) PyCallIter_Type; + +#define PySeqIter_Check(op) Py_IS_TYPE((op), &PySeqIter_Type) + +PyAPI_FUNC(PyObject *) PySeqIter_New(PyObject *); + + +#define PyCallIter_Check(op) Py_IS_TYPE((op), &PyCallIter_Type) + +PyAPI_FUNC(PyObject *) PyCallIter_New(PyObject *, PyObject *); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_ITEROBJECT_H */ + diff --git a/Include/listobject.h b/Include/listobject.h new file mode 100644 index 0000000000000000000000000000000000000000..e1e059b0ba7466c4a6d6fd54e5fe70bd99f4e8c5 --- /dev/null +++ b/Include/listobject.h @@ -0,0 +1,55 @@ +/* List object interface + + Another generally useful object type is a list of object 
pointers. + This is a mutable type: the list items can be changed, and items can be + added or removed. Out-of-range indices or non-list objects are ignored. + + WARNING: PyList_SetItem does not increment the new item's reference count, + but does decrement the reference count of the item it replaces, if not nil. + It does *decrement* the reference count if it is *not* inserted in the list. + Similarly, PyList_GetItem does not increment the returned item's reference + count. +*/ + +#ifndef Py_LISTOBJECT_H +#define Py_LISTOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_DATA(PyTypeObject) PyList_Type; +PyAPI_DATA(PyTypeObject) PyListIter_Type; +PyAPI_DATA(PyTypeObject) PyListRevIter_Type; + +#define PyList_Check(op) \ + PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_LIST_SUBCLASS) +#define PyList_CheckExact(op) Py_IS_TYPE((op), &PyList_Type) + +PyAPI_FUNC(PyObject *) PyList_New(Py_ssize_t size); +PyAPI_FUNC(Py_ssize_t) PyList_Size(PyObject *); + +PyAPI_FUNC(PyObject *) PyList_GetItem(PyObject *, Py_ssize_t); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030d0000 +PyAPI_FUNC(PyObject *) PyList_GetItemRef(PyObject *, Py_ssize_t); +#endif +PyAPI_FUNC(int) PyList_SetItem(PyObject *, Py_ssize_t, PyObject *); +PyAPI_FUNC(int) PyList_Insert(PyObject *, Py_ssize_t, PyObject *); +PyAPI_FUNC(int) PyList_Append(PyObject *, PyObject *); + +PyAPI_FUNC(PyObject *) PyList_GetSlice(PyObject *, Py_ssize_t, Py_ssize_t); +PyAPI_FUNC(int) PyList_SetSlice(PyObject *, Py_ssize_t, Py_ssize_t, PyObject *); + +PyAPI_FUNC(int) PyList_Sort(PyObject *); +PyAPI_FUNC(int) PyList_Reverse(PyObject *); +PyAPI_FUNC(PyObject *) PyList_AsTuple(PyObject *); + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_LISTOBJECT_H +# include "cpython/listobject.h" +# undef Py_CPYTHON_LISTOBJECT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_LISTOBJECT_H */ diff --git a/Include/lock.h b/Include/lock.h new file mode 100644 index 0000000000000000000000000000000000000000..782b9dbc70d056994c6bbd90d6a5606ef57369d2 --- /dev/null +++ b/Include/lock.h @@ -0,0 +1,16 @@ +#ifndef Py_LOCK_H +#define Py_LOCK_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_LOCK_H +# include "cpython/lock.h" +# undef Py_CPYTHON_LOCK_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_LOCK_H */ diff --git a/Include/longobject.h b/Include/longobject.h new file mode 100644 index 0000000000000000000000000000000000000000..19104cd9d1bef9708e7d5e17bfc11dc5bab254bf --- /dev/null +++ b/Include/longobject.h @@ -0,0 +1,114 @@ +#ifndef Py_LONGOBJECT_H +#define Py_LONGOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + + +/* Long (arbitrary precision) integer object interface */ + +// PyLong_Type is declared by object.h + +#define PyLong_Check(op) \ + PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_LONG_SUBCLASS) +#define PyLong_CheckExact(op) Py_IS_TYPE((op), &PyLong_Type) + +PyAPI_FUNC(PyObject *) PyLong_FromLong(long); +PyAPI_FUNC(PyObject *) PyLong_FromUnsignedLong(unsigned long); +PyAPI_FUNC(PyObject *) PyLong_FromSize_t(size_t); +PyAPI_FUNC(PyObject *) PyLong_FromSsize_t(Py_ssize_t); +PyAPI_FUNC(PyObject *) PyLong_FromDouble(double); + +PyAPI_FUNC(long) PyLong_AsLong(PyObject *); +PyAPI_FUNC(long) PyLong_AsLongAndOverflow(PyObject *, int *); +PyAPI_FUNC(Py_ssize_t) PyLong_AsSsize_t(PyObject *); +PyAPI_FUNC(size_t) PyLong_AsSize_t(PyObject *); +PyAPI_FUNC(unsigned long) PyLong_AsUnsignedLong(PyObject *); +PyAPI_FUNC(unsigned long) PyLong_AsUnsignedLongMask(PyObject *); + +#if !defined(Py_LIMITED_API) || 
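The WARNING in the list interface above is one of the oldest sharp edges in the C API, so a minimal sketch of the ownership rules may help (an initialized interpreter is assumed): `PyList_SetItem()` steals the reference to the new item, even on failure, while `PyList_GetItem()` hands back a borrowed reference the caller must not release.

```c
#include <Python.h>

static PyObject *
make_singleton_list(long value)
{
    PyObject *list = PyList_New(1);           // new reference; the slot starts NULL
    if (list == NULL) {
        return NULL;
    }
    PyObject *item = PyLong_FromLong(value);  // new reference, ours for now
    if (item == NULL || PyList_SetItem(list, 0, item) < 0) {
        // On failure PyList_SetItem has already consumed `item`,
        // so only the list itself is left to clean up.
        Py_DECREF(list);
        return NULL;
    }
    assert(PyList_GetItem(list, 0) == item);  // borrowed: no Py_DECREF here
    return list;                              // caller owns the list
}
```

The `PyList_GetItemRef()` declared just below (3.13 and later) sidesteps the borrowed-reference pitfall entirely by returning a new reference.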
Py_LIMITED_API+0 >= 0x030d0000 +PyAPI_FUNC(int) PyLong_AsInt(PyObject *); +#endif + +PyAPI_FUNC(PyObject *) PyLong_GetInfo(void); + +/* It may be useful in the future. I've added it in the PyInt -> PyLong + cleanup to keep the extra information. [CH] */ +#define PyLong_AS_LONG(op) PyLong_AsLong(op) + +/* Issue #1983: pid_t can be longer than a C long on some systems */ +#if !defined(SIZEOF_PID_T) || SIZEOF_PID_T == SIZEOF_INT +#define _Py_PARSE_PID "i" +#define PyLong_FromPid PyLong_FromLong +# if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030d0000 +# define PyLong_AsPid PyLong_AsInt +# elif SIZEOF_INT == SIZEOF_LONG +# define PyLong_AsPid PyLong_AsLong +# else +static inline int +PyLong_AsPid(PyObject *obj) +{ + int overflow; + long result = PyLong_AsLongAndOverflow(obj, &overflow); + if (overflow || result > INT_MAX || result < INT_MIN) { + PyErr_SetString(PyExc_OverflowError, + "Python int too large to convert to C int"); + return -1; + } + return (int)result; +} +# endif +#elif SIZEOF_PID_T == SIZEOF_LONG +#define _Py_PARSE_PID "l" +#define PyLong_FromPid PyLong_FromLong +#define PyLong_AsPid PyLong_AsLong +#elif defined(SIZEOF_LONG_LONG) && SIZEOF_PID_T == SIZEOF_LONG_LONG +#define _Py_PARSE_PID "L" +#define PyLong_FromPid PyLong_FromLongLong +#define PyLong_AsPid PyLong_AsLongLong +#else +#error "sizeof(pid_t) is neither sizeof(int), sizeof(long) or sizeof(long long)" +#endif /* SIZEOF_PID_T */ + +#if SIZEOF_VOID_P == SIZEOF_INT +# define _Py_PARSE_INTPTR "i" +# define _Py_PARSE_UINTPTR "I" +#elif SIZEOF_VOID_P == SIZEOF_LONG +# define _Py_PARSE_INTPTR "l" +# define _Py_PARSE_UINTPTR "k" +#elif defined(SIZEOF_LONG_LONG) && SIZEOF_VOID_P == SIZEOF_LONG_LONG +# define _Py_PARSE_INTPTR "L" +# define _Py_PARSE_UINTPTR "K" +#else +# error "void* different in size from int, long and long long" +#endif /* SIZEOF_VOID_P */ + +PyAPI_FUNC(double) PyLong_AsDouble(PyObject *); +PyAPI_FUNC(PyObject *) PyLong_FromVoidPtr(void *); +PyAPI_FUNC(void *) PyLong_AsVoidPtr(PyObject *); + +PyAPI_FUNC(PyObject *) PyLong_FromLongLong(long long); +PyAPI_FUNC(PyObject *) PyLong_FromUnsignedLongLong(unsigned long long); +PyAPI_FUNC(long long) PyLong_AsLongLong(PyObject *); +PyAPI_FUNC(unsigned long long) PyLong_AsUnsignedLongLong(PyObject *); +PyAPI_FUNC(unsigned long long) PyLong_AsUnsignedLongLongMask(PyObject *); +PyAPI_FUNC(long long) PyLong_AsLongLongAndOverflow(PyObject *, int *); + +PyAPI_FUNC(PyObject *) PyLong_FromString(const char *, char **, int); + +/* These aren't really part of the int object, but they're handy. The + functions are in Python/mystrtoul.c. 
+ */ +PyAPI_FUNC(unsigned long) PyOS_strtoul(const char *, char **, int); +PyAPI_FUNC(long) PyOS_strtol(const char *, char **, int); + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_LONGOBJECT_H +# include "cpython/longobject.h" +# undef Py_CPYTHON_LONGOBJECT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_LONGOBJECT_H */ diff --git a/Include/marshal.h b/Include/marshal.h new file mode 100644 index 0000000000000000000000000000000000000000..f8b0de80cfc38df628992918fc1e6cd19cdf4a6b --- /dev/null +++ b/Include/marshal.h @@ -0,0 +1,31 @@ + +/* Interface for marshal.c */ + +#ifndef Py_MARSHAL_H +#define Py_MARSHAL_H +#ifndef Py_LIMITED_API + +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_FUNC(PyObject *) PyMarshal_ReadObjectFromString(const char *, + Py_ssize_t); +PyAPI_FUNC(PyObject *) PyMarshal_WriteObjectToString(PyObject *, int); + +#define Py_MARSHAL_VERSION 4 + +PyAPI_FUNC(long) PyMarshal_ReadLongFromFile(FILE *); +PyAPI_FUNC(int) PyMarshal_ReadShortFromFile(FILE *); +PyAPI_FUNC(PyObject *) PyMarshal_ReadObjectFromFile(FILE *); +PyAPI_FUNC(PyObject *) PyMarshal_ReadLastObjectFromFile(FILE *); + +PyAPI_FUNC(void) PyMarshal_WriteLongToFile(long, FILE *, int); +PyAPI_FUNC(void) PyMarshal_WriteObjectToFile(PyObject *, FILE *, int); + +#ifdef __cplusplus +} +#endif + +#endif /* Py_LIMITED_API */ +#endif /* !Py_MARSHAL_H */ diff --git a/Include/memoryobject.h b/Include/memoryobject.h new file mode 100644 index 0000000000000000000000000000000000000000..2c9146aa2b5b06ee2e5c54ec03f180a970e33a72 --- /dev/null +++ b/Include/memoryobject.h @@ -0,0 +1,34 @@ +/* Memory view object. In Python this is available as "memoryview". */ + +#ifndef Py_MEMORYOBJECT_H +#define Py_MEMORYOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_DATA(PyTypeObject) PyMemoryView_Type; + +#define PyMemoryView_Check(op) Py_IS_TYPE((op), &PyMemoryView_Type) + +PyAPI_FUNC(PyObject *) PyMemoryView_FromObject(PyObject *base); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(PyObject *) PyMemoryView_FromMemory(char *mem, Py_ssize_t size, + int flags); +#endif +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030b0000 +PyAPI_FUNC(PyObject *) PyMemoryView_FromBuffer(const Py_buffer *info); +#endif +PyAPI_FUNC(PyObject *) PyMemoryView_GetContiguous(PyObject *base, + int buffertype, + char order); + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_MEMORYOBJECT_H +# include "cpython/memoryobject.h" +# undef Py_CPYTHON_MEMORYOBJECT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_MEMORYOBJECT_H */ diff --git a/Include/methodobject.h b/Include/methodobject.h new file mode 100644 index 0000000000000000000000000000000000000000..39272815b127f4e8c394cc95ce6aa17464f5b962 --- /dev/null +++ b/Include/methodobject.h @@ -0,0 +1,137 @@ + +/* Method object interface */ + +#ifndef Py_METHODOBJECT_H +#define Py_METHODOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +/* This is about the type 'builtin_function_or_method', + not Python methods in user-defined classes. See classobject.h + for the latter. 
*/ + +PyAPI_DATA(PyTypeObject) PyCFunction_Type; + +#define PyCFunction_CheckExact(op) Py_IS_TYPE((op), &PyCFunction_Type) +#define PyCFunction_Check(op) PyObject_TypeCheck((op), &PyCFunction_Type) + +typedef PyObject *(*PyCFunction)(PyObject *, PyObject *); +typedef PyObject *(*PyCFunctionFast) (PyObject *, PyObject *const *, Py_ssize_t); +typedef PyObject *(*PyCFunctionWithKeywords)(PyObject *, PyObject *, + PyObject *); +typedef PyObject *(*PyCFunctionFastWithKeywords) (PyObject *, + PyObject *const *, Py_ssize_t, + PyObject *); +typedef PyObject *(*PyCMethod)(PyObject *, PyTypeObject *, PyObject *const *, + size_t, PyObject *); + +// For backwards compatibility. `METH_FASTCALL` was added to the stable API in +// 3.10 alongside `_PyCFunctionFastWithKeywords` and `_PyCFunctionFast`. +// Note that the underscore-prefixed names were documented in public docs; +// people may be using them. +typedef PyCFunctionFast _PyCFunctionFast; +typedef PyCFunctionFastWithKeywords _PyCFunctionFastWithKeywords; + +// Cast an function to the PyCFunction type to use it with PyMethodDef. +// +// This macro can be used to prevent compiler warnings if the first parameter +// uses a different pointer type than PyObject* (ex: METH_VARARGS and METH_O +// calling conventions). +// +// The macro can also be used for METH_FASTCALL and METH_VARARGS|METH_KEYWORDS +// calling conventions to avoid compiler warnings because the function has more +// than 2 parameters. The macro first casts the function to the +// "void func(void)" type to prevent compiler warnings. +// +// If a function is declared with the METH_NOARGS calling convention, it must +// have 2 parameters. Since the second parameter is unused, Py_UNUSED() can be +// used to prevent a compiler warning. If the function has a single parameter, +// it triggers an undefined behavior when Python calls it with 2 parameters +// (bpo-33012). +#define _PyCFunction_CAST(func) \ + _Py_CAST(PyCFunction, _Py_CAST(void(*)(void), (func))) + +PyAPI_FUNC(PyCFunction) PyCFunction_GetFunction(PyObject *); +PyAPI_FUNC(PyObject *) PyCFunction_GetSelf(PyObject *); +PyAPI_FUNC(int) PyCFunction_GetFlags(PyObject *); + +struct PyMethodDef { + const char *ml_name; /* The name of the built-in function/method */ + PyCFunction ml_meth; /* The C function that implements it */ + int ml_flags; /* Combination of METH_xxx flags, which mostly + describe the args expected by the C func */ + const char *ml_doc; /* The __doc__ attribute, or NULL */ +}; + +/* PyCFunction_New is declared as a function for stable ABI (declaration is + * needed for e.g. GCC with -fvisibility=hidden), but redefined as a macro + * that calls PyCFunction_NewEx. */ +PyAPI_FUNC(PyObject *) PyCFunction_New(PyMethodDef *, PyObject *); +#define PyCFunction_New(ML, SELF) PyCFunction_NewEx((ML), (SELF), NULL) + +/* PyCFunction_NewEx is similar: on 3.9+, this calls PyCMethod_New. */ +PyAPI_FUNC(PyObject *) PyCFunction_NewEx(PyMethodDef *, PyObject *, + PyObject *); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000 +#define PyCFunction_NewEx(ML, SELF, MOD) PyCMethod_New((ML), (SELF), (MOD), NULL) +PyAPI_FUNC(PyObject *) PyCMethod_New(PyMethodDef *, PyObject *, + PyObject *, PyTypeObject *); +#endif + + +/* Flag passed to newmethodobject */ +/* #define METH_OLDARGS 0x0000 -- unsupported now */ +#define METH_VARARGS 0x0001 +#define METH_KEYWORDS 0x0002 +/* METH_NOARGS and METH_O must not be combined with the flags above. 
*/ +#define METH_NOARGS 0x0004 +#define METH_O 0x0008 + +/* METH_CLASS and METH_STATIC are a little different; these control + the construction of methods for a class. These cannot be used for + functions in modules. */ +#define METH_CLASS 0x0010 +#define METH_STATIC 0x0020 + +/* METH_COEXIST allows a method to be entered even though a slot has + already filled the entry. When defined, the flag allows a separate + method, "__contains__" for example, to coexist with a defined + slot like sq_contains. */ + +#define METH_COEXIST 0x0040 + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030a0000 +# define METH_FASTCALL 0x0080 +#endif + +/* This bit is preserved for Stackless Python */ +#ifdef STACKLESS +# define METH_STACKLESS 0x0100 +#else +# define METH_STACKLESS 0x0000 +#endif + +/* METH_METHOD means the function stores an + * additional reference to the class that defines it; + * both self and class are passed to it. + * It uses PyCMethodObject instead of PyCFunctionObject. + * May not be combined with METH_NOARGS, METH_O, METH_CLASS or METH_STATIC. + */ + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000 +#define METH_METHOD 0x0200 +#endif + + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_METHODOBJECT_H +# include "cpython/methodobject.h" +# undef Py_CPYTHON_METHODOBJECT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_METHODOBJECT_H */ diff --git a/Include/modsupport.h b/Include/modsupport.h new file mode 100644 index 0000000000000000000000000000000000000000..af995f567b004c9a288f57118e16dab74be47878 --- /dev/null +++ b/Include/modsupport.h @@ -0,0 +1,146 @@ +// Module support interface + +#ifndef Py_MODSUPPORT_H +#define Py_MODSUPPORT_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_FUNC(int) PyArg_Parse(PyObject *, const char *, ...); +PyAPI_FUNC(int) PyArg_ParseTuple(PyObject *, const char *, ...); +PyAPI_FUNC(int) PyArg_ParseTupleAndKeywords(PyObject *, PyObject *, + const char *, PY_CXX_CONST char * const *, ...); +PyAPI_FUNC(int) PyArg_VaParse(PyObject *, const char *, va_list); +PyAPI_FUNC(int) PyArg_VaParseTupleAndKeywords(PyObject *, PyObject *, + const char *, PY_CXX_CONST char * const *, va_list); + +PyAPI_FUNC(int) PyArg_ValidateKeywordArguments(PyObject *); +PyAPI_FUNC(int) PyArg_UnpackTuple(PyObject *, const char *, Py_ssize_t, Py_ssize_t, ...); +PyAPI_FUNC(PyObject *) Py_BuildValue(const char *, ...); +PyAPI_FUNC(PyObject *) Py_VaBuildValue(const char *, va_list); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030a0000 +// Add an attribute with name 'name' and value 'obj' to the module 'mod. +// On success, return 0. +// On error, raise an exception and return -1. +PyAPI_FUNC(int) PyModule_AddObjectRef(PyObject *mod, const char *name, PyObject *value); +#endif /* Py_LIMITED_API */ + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030d0000 +// Similar to PyModule_AddObjectRef() but steal a reference to 'value'. +PyAPI_FUNC(int) PyModule_Add(PyObject *mod, const char *name, PyObject *value); +#endif /* Py_LIMITED_API */ + +// Similar to PyModule_AddObjectRef() and PyModule_Add() but steal +// a reference to 'value' on success and only on success. +// Errorprone. Should not be used in new code. 
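+//
+// A hedged sketch of the preferred pattern in a module exec function; the
+// names "mod" and "answer" are illustrative only:
+//
+//     PyObject *v = PyLong_FromLong(42);
+//     if (v == NULL) {
+//         return -1;
+//     }
+//     int rc = PyModule_AddObjectRef(mod, "answer", v);  // no reference stolen
+//     Py_DECREF(v);  // drop our reference whether or not the add succeeded
+//     if (rc < 0) {
+//         return -1;
+//     }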
+PyAPI_FUNC(int) PyModule_AddObject(PyObject *mod, const char *, PyObject *value); + +PyAPI_FUNC(int) PyModule_AddIntConstant(PyObject *, const char *, long); +PyAPI_FUNC(int) PyModule_AddStringConstant(PyObject *, const char *, const char *); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000 +/* New in 3.9 */ +PyAPI_FUNC(int) PyModule_AddType(PyObject *module, PyTypeObject *type); +#endif /* Py_LIMITED_API */ + +#define PyModule_AddIntMacro(m, c) PyModule_AddIntConstant((m), #c, (c)) +#define PyModule_AddStringMacro(m, c) PyModule_AddStringConstant((m), #c, (c)) + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 +/* New in 3.5 */ +PyAPI_FUNC(int) PyModule_SetDocString(PyObject *, const char *); +PyAPI_FUNC(int) PyModule_AddFunctions(PyObject *, PyMethodDef *); +PyAPI_FUNC(int) PyModule_ExecDef(PyObject *module, PyModuleDef *def); +#endif + +#define Py_CLEANUP_SUPPORTED 0x20000 + +#define PYTHON_API_VERSION 1013 +#define PYTHON_API_STRING "1013" +/* The API version is maintained (independently from the Python version) + so we can detect mismatches between the interpreter and dynamically + loaded modules. These are diagnosed by an error message but + the module is still loaded (because the mismatch can only be tested + after loading the module). The error message is intended to + explain the core dump a few seconds later. + + The symbol PYTHON_API_STRING defines the same value as a string + literal. *** PLEASE MAKE SURE THE DEFINITIONS MATCH. *** + + Please add a line or two to the top of this log for each API + version change: + + 22-Feb-2006 MvL 1013 PEP 353 - long indices for sequence lengths + + 19-Aug-2002 GvR 1012 Changes to string object struct for + interning changes, saving 3 bytes. + + 17-Jul-2001 GvR 1011 Descr-branch, just to be on the safe side + + 25-Jan-2001 FLD 1010 Parameters added to PyCode_New() and + PyFrame_New(); Python 2.1a2 + + 14-Mar-2000 GvR 1009 Unicode API added + + 3-Jan-1999 GvR 1007 Decided to change back! (Don't reuse 1008!) + + 3-Dec-1998 GvR 1008 Python 1.5.2b1 + + 18-Jan-1997 GvR 1007 string interning and other speedups + + 11-Oct-1996 GvR renamed Py_Ellipses to Py_Ellipsis :-( + + 30-Jul-1996 GvR Slice and ellipses syntax added + + 23-Jul-1996 GvR For 1.4 -- better safe than sorry this time :-) + + 7-Nov-1995 GvR Keyword arguments (should've been done at 1.3 :-( ) + + 10-Jan-1995 GvR Renamed globals to new naming scheme + + 9-Jan-1995 GvR Initial version (incompatible with older API) +*/ + +/* The PYTHON_ABI_VERSION is introduced in PEP 384. For the lifetime of + Python 3, it will stay at the value of 3; changes to the limited API + must be performed in a strictly backwards-compatible manner. 
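+
+   A hedged, minimal sketch of how the embedded version reaches this
+   check in practice; the module name "spam" is hypothetical:
+
+       static struct PyModuleDef spam_module = {
+           PyModuleDef_HEAD_INIT,
+           "spam",    // m_name
+           NULL,      // m_doc
+           0,         // m_size (no per-module state)
+           NULL,      // m_methods
+       };
+
+       PyMODINIT_FUNC
+       PyInit_spam(void)
+       {
+           // Expands to PyModule_Create2(&spam_module, PYTHON_API_VERSION),
+           // or PYTHON_ABI_VERSION under Py_LIMITED_API, so the version
+           // recorded at compile time travels with the extension.
+           return PyModule_Create(&spam_module);
+       }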
*/ +#define PYTHON_ABI_VERSION 3 +#define PYTHON_ABI_STRING "3" + +PyAPI_FUNC(PyObject *) PyModule_Create2(PyModuleDef*, int apiver); + +#ifdef Py_LIMITED_API +#define PyModule_Create(module) \ + PyModule_Create2((module), PYTHON_ABI_VERSION) +#else +#define PyModule_Create(module) \ + PyModule_Create2((module), PYTHON_API_VERSION) +#endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 +/* New in 3.5 */ +PyAPI_FUNC(PyObject *) PyModule_FromDefAndSpec2(PyModuleDef *def, + PyObject *spec, + int module_api_version); + +#ifdef Py_LIMITED_API +#define PyModule_FromDefAndSpec(module, spec) \ + PyModule_FromDefAndSpec2((module), (spec), PYTHON_ABI_VERSION) +#else +#define PyModule_FromDefAndSpec(module, spec) \ + PyModule_FromDefAndSpec2((module), (spec), PYTHON_API_VERSION) +#endif /* Py_LIMITED_API */ + +#endif /* New in 3.5 */ + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_MODSUPPORT_H +# include "cpython/modsupport.h" +# undef Py_CPYTHON_MODSUPPORT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_MODSUPPORT_H */ diff --git a/Include/moduleobject.h b/Include/moduleobject.h new file mode 100644 index 0000000000000000000000000000000000000000..2a17c891dda811d4c407c7f2abe44ce89db7c5d8 --- /dev/null +++ b/Include/moduleobject.h @@ -0,0 +1,122 @@ + +/* Module object interface */ + +#ifndef Py_MODULEOBJECT_H +#define Py_MODULEOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_DATA(PyTypeObject) PyModule_Type; + +#define PyModule_Check(op) PyObject_TypeCheck((op), &PyModule_Type) +#define PyModule_CheckExact(op) Py_IS_TYPE((op), &PyModule_Type) + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(PyObject *) PyModule_NewObject( + PyObject *name + ); +#endif +PyAPI_FUNC(PyObject *) PyModule_New( + const char *name /* UTF-8 encoded string */ + ); +PyAPI_FUNC(PyObject *) PyModule_GetDict(PyObject *); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(PyObject *) PyModule_GetNameObject(PyObject *); +#endif +PyAPI_FUNC(const char *) PyModule_GetName(PyObject *); +Py_DEPRECATED(3.2) PyAPI_FUNC(const char *) PyModule_GetFilename(PyObject *); +PyAPI_FUNC(PyObject *) PyModule_GetFilenameObject(PyObject *); +PyAPI_FUNC(PyModuleDef*) PyModule_GetDef(PyObject*); +PyAPI_FUNC(void*) PyModule_GetState(PyObject*); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 +/* New in 3.5 */ +PyAPI_FUNC(PyObject *) PyModuleDef_Init(PyModuleDef*); +PyAPI_DATA(PyTypeObject) PyModuleDef_Type; +#endif + +typedef struct PyModuleDef_Base { + PyObject_HEAD + /* The function used to re-initialize the module. + This is only set for legacy (single-phase init) extension modules + and only used for those that support multiple initializations + (m_size >= 0). + It is set by _PyImport_LoadDynamicModuleWithSpec() + and _imp.create_builtin(). */ + PyObject* (*m_init)(void); + /* The module's index into its interpreter's modules_by_index cache. + This is set for all extension modules but only used for legacy ones. + (See PyInterpreterState.modules_by_index for more info.) + It is set by PyModuleDef_Init(). */ + Py_ssize_t m_index; + /* A copy of the module's __dict__ after the first time it was loaded. + This is only set/used for legacy modules that do not support + multiple initializations. + It is set by fix_up_extension() in import.c. 
*/ + PyObject* m_copy; +} PyModuleDef_Base; + +#define PyModuleDef_HEAD_INIT { \ + PyObject_HEAD_INIT(_Py_NULL) \ + _Py_NULL, /* m_init */ \ + 0, /* m_index */ \ + _Py_NULL, /* m_copy */ \ + } + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 +/* New in 3.5 */ +struct PyModuleDef_Slot { + int slot; + void *value; +}; + +#define Py_mod_create 1 +#define Py_mod_exec 2 +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030c0000 +# define Py_mod_multiple_interpreters 3 +#endif +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030d0000 +# define Py_mod_gil 4 +#endif + + +#ifndef Py_LIMITED_API +#define _Py_mod_LAST_SLOT 4 +#endif + +#endif /* New in 3.5 */ + +/* for Py_mod_multiple_interpreters: */ +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030c0000 +# define Py_MOD_MULTIPLE_INTERPRETERS_NOT_SUPPORTED ((void *)0) +# define Py_MOD_MULTIPLE_INTERPRETERS_SUPPORTED ((void *)1) +# define Py_MOD_PER_INTERPRETER_GIL_SUPPORTED ((void *)2) +#endif + +/* for Py_mod_gil: */ +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030d0000 +# define Py_MOD_GIL_USED ((void *)0) +# define Py_MOD_GIL_NOT_USED ((void *)1) +#endif + +#if !defined(Py_LIMITED_API) && defined(Py_GIL_DISABLED) +PyAPI_FUNC(int) PyUnstable_Module_SetGIL(PyObject *module, void *gil); +#endif + +struct PyModuleDef { + PyModuleDef_Base m_base; + const char* m_name; + const char* m_doc; + Py_ssize_t m_size; + PyMethodDef *m_methods; + PyModuleDef_Slot *m_slots; + traverseproc m_traverse; + inquiry m_clear; + freefunc m_free; +}; + +#ifdef __cplusplus +} +#endif +#endif /* !Py_MODULEOBJECT_H */ diff --git a/Include/monitoring.h b/Include/monitoring.h new file mode 100644 index 0000000000000000000000000000000000000000..985f7f230e44e3df89ff53efbef604ca298f09f0 --- /dev/null +++ b/Include/monitoring.h @@ -0,0 +1,18 @@ +#ifndef Py_MONITORING_H +#define Py_MONITORING_H +#ifdef __cplusplus +extern "C" { +#endif + +// There is currently no limited API for monitoring + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_MONITORING_H +# include "cpython/monitoring.h" +# undef Py_CPYTHON_MONITORING_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_MONITORING_H */ diff --git a/Include/object.h b/Include/object.h new file mode 100644 index 0000000000000000000000000000000000000000..0266d25230a2b423c5f5034386188a8676e2a510 --- /dev/null +++ b/Include/object.h @@ -0,0 +1,1275 @@ +#ifndef Py_OBJECT_H +#define Py_OBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + + +/* Object and type object interface */ + +/* +Objects are structures allocated on the heap. Special rules apply to +the use of objects to ensure they are properly garbage-collected. +Objects are never allocated statically or on the stack; they must be +accessed through special macros and functions only. (Type objects are +exceptions to the first rule; the standard types are represented by +statically initialized type objects, although work on type/class unification +for Python 2.2 made it possible to have heap-allocated type objects too). + +An object has a 'reference count' that is increased or decreased when a +pointer to the object is copied or deleted; when the reference count +reaches zero there are no references to the object left and it can be +removed from the heap. + +An object has a 'type' that determines what it represents and what kind +of data it contains. An object's type is fixed when it is created. +Types themselves are represented as objects; an object contains a +pointer to the corresponding type object. 
The type itself has a type +pointer pointing to the object representing the type 'type', which +contains a pointer to itself!. + +Objects do not float around in memory; once allocated an object keeps +the same size and address. Objects that must hold variable-size data +can contain pointers to variable-size parts of the object. Not all +objects of the same type have the same size; but the size cannot change +after allocation. (These restrictions are made so a reference to an +object can be simply a pointer -- moving an object would require +updating all the pointers, and changing an object's size would require +moving it if there was another object right next to it.) + +Objects are always accessed through pointers of the type 'PyObject *'. +The type 'PyObject' is a structure that only contains the reference count +and the type pointer. The actual memory allocated for an object +contains other data that can only be accessed after casting the pointer +to a pointer to a longer structure type. This longer type must start +with the reference count and type fields; the macro PyObject_HEAD should be +used for this (to accommodate for future changes). The implementation +of a particular object type can cast the object pointer to the proper +type and back. + +A standard interface exists for objects that contain an array of items +whose size is determined when the object is allocated. +*/ + +/* Py_DEBUG implies Py_REF_DEBUG. */ +#if defined(Py_DEBUG) && !defined(Py_REF_DEBUG) +# define Py_REF_DEBUG +#endif + +/* PyObject_HEAD defines the initial segment of every PyObject. */ +#define PyObject_HEAD PyObject ob_base; + +/* +Immortalization: + +The following indicates the immortalization strategy depending on the amount +of available bits in the reference count field. All strategies are backwards +compatible but the specific reference count value or immortalization check +might change depending on the specializations for the underlying system. + +Proper deallocation of immortal instances requires distinguishing between +statically allocated immortal instances vs those promoted by the runtime to be +immortal. The latter should be the only instances that require +cleanup during runtime finalization. +*/ + +#if SIZEOF_VOID_P > 4 +/* +In 64+ bit systems, an object will be marked as immortal by setting all of the +lower 32 bits of the reference count field, which is equal to: 0xFFFFFFFF + +Using the lower 32 bits makes the value backwards compatible by allowing +C-Extensions without the updated checks in Py_INCREF and Py_DECREF to safely +increase and decrease the objects reference count. The object would lose its +immortality, but the execution would still be correct. + +Reference count increases will use saturated arithmetic, taking advantage of +having all the lower 32 bits set, which will avoid the reference count to go +beyond the refcount limit. Immortality checks for reference count decreases will +be done by checking the bit sign flag in the lower 32 bits. +*/ +#define _Py_IMMORTAL_REFCNT _Py_CAST(Py_ssize_t, UINT_MAX) + +#else +/* +In 32 bit systems, an object will be marked as immortal by setting all of the +lower 30 bits of the reference count field, which is equal to: 0x3FFFFFFF + +Using the lower 30 bits makes the value backwards compatible by allowing +C-Extensions without the updated checks in Py_INCREF and Py_DECREF to safely +increase and decrease the objects reference count. The object would lose its +immortality, but the execution would still be correct. 
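+
+Worked example (illustrative): with the immortal value 0x3FFFFFFF used
+here, an extension built without the updated checks that blindly executes
+op->ob_refcnt++ leaves 0x40000000 in the field. The object merely stops
+being immortal and is reference-counted normally from then on, so
+execution stays correct, exactly as described above.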
+ +Reference count increases and decreases will first go through an immortality +check by comparing the reference count field to the immortality reference count. +*/ +#define _Py_IMMORTAL_REFCNT _Py_CAST(Py_ssize_t, UINT_MAX >> 2) +#endif + +// Py_GIL_DISABLED builds indicate immortal objects using `ob_ref_local`, which is +// always 32-bits. +#ifdef Py_GIL_DISABLED +#define _Py_IMMORTAL_REFCNT_LOCAL UINT32_MAX +#endif + +// Kept for backward compatibility. It was needed by Py_TRACE_REFS build. +#define _PyObject_EXTRA_INIT + +/* Make all uses of PyObject_HEAD_INIT immortal. + * + * Statically allocated objects might be shared between + * interpreters, so must be marked as immortal. + */ +#if defined(Py_GIL_DISABLED) +#define PyObject_HEAD_INIT(type) \ + { \ + 0, \ + 0, \ + { 0 }, \ + 0, \ + _Py_IMMORTAL_REFCNT_LOCAL, \ + 0, \ + (type), \ + }, +#else +#define PyObject_HEAD_INIT(type) \ + { \ + { _Py_IMMORTAL_REFCNT }, \ + (type) \ + }, +#endif + +#define PyVarObject_HEAD_INIT(type, size) \ + { \ + PyObject_HEAD_INIT(type) \ + (size) \ + }, + +/* PyObject_VAR_HEAD defines the initial segment of all variable-size + * container objects. These end with a declaration of an array with 1 + * element, but enough space is malloc'ed so that the array actually + * has room for ob_size elements. Note that ob_size is an element count, + * not necessarily a byte count. + */ +#define PyObject_VAR_HEAD PyVarObject ob_base; +#define Py_INVALID_SIZE (Py_ssize_t)-1 + +/* Nothing is actually declared to be a PyObject, but every pointer to + * a Python object can be cast to a PyObject*. This is inheritance built + * by hand. Similarly every pointer to a variable-size Python object can, + * in addition, be cast to PyVarObject*. + */ +#ifndef Py_GIL_DISABLED +struct _object { +#if (defined(__GNUC__) || defined(__clang__)) \ + && !(defined __STDC_VERSION__ && __STDC_VERSION__ >= 201112L) + // On C99 and older, anonymous union is a GCC and clang extension + __extension__ +#endif +#ifdef _MSC_VER + // Ignore MSC warning C4201: "nonstandard extension used: + // nameless struct/union" + __pragma(warning(push)) + __pragma(warning(disable: 4201)) +#endif + union { + Py_ssize_t ob_refcnt; +#if SIZEOF_VOID_P > 4 + PY_UINT32_T ob_refcnt_split[2]; +#endif + }; +#ifdef _MSC_VER + __pragma(warning(pop)) +#endif + + PyTypeObject *ob_type; +}; +#else +// Objects that are not owned by any thread use a thread id (tid) of zero. +// This includes both immortal objects and objects whose reference count +// fields have been merged. +#define _Py_UNOWNED_TID 0 + +// The shared reference count uses the two least-significant bits to store +// flags. The remaining bits are used to store the reference count. +#define _Py_REF_SHARED_SHIFT 2 +#define _Py_REF_SHARED_FLAG_MASK 0x3 + +// The shared flags are initialized to zero. +#define _Py_REF_SHARED_INIT 0x0 +#define _Py_REF_MAYBE_WEAKREF 0x1 +#define _Py_REF_QUEUED 0x2 +#define _Py_REF_MERGED 0x3 + +// Create a shared field from a refcnt and desired flags +#define _Py_REF_SHARED(refcnt, flags) (((refcnt) << _Py_REF_SHARED_SHIFT) + (flags)) + +struct _object { + // ob_tid stores the thread id (or zero). It is also used by the GC and the + // trashcan mechanism as a linked list pointer and by the GC to store the + // computed "gc_refs" refcount. 
+ uintptr_t ob_tid; + uint16_t _padding; + PyMutex ob_mutex; // per-object lock + uint8_t ob_gc_bits; // gc-related state + uint32_t ob_ref_local; // local reference count + Py_ssize_t ob_ref_shared; // shared (atomic) reference count + PyTypeObject *ob_type; +}; +#endif + +/* Cast argument to PyObject* type. */ +#define _PyObject_CAST(op) _Py_CAST(PyObject*, (op)) + +typedef struct { + PyObject ob_base; + Py_ssize_t ob_size; /* Number of items in variable part */ +} PyVarObject; + +/* Cast argument to PyVarObject* type. */ +#define _PyVarObject_CAST(op) _Py_CAST(PyVarObject*, (op)) + + +// Test if the 'x' object is the 'y' object, the same as "x is y" in Python. +PyAPI_FUNC(int) Py_Is(PyObject *x, PyObject *y); +#define Py_Is(x, y) ((x) == (y)) + +#if defined(Py_GIL_DISABLED) && !defined(Py_LIMITED_API) +PyAPI_FUNC(uintptr_t) _Py_GetThreadLocal_Addr(void); + +static inline uintptr_t +_Py_ThreadId(void) +{ + uintptr_t tid; +#if defined(_MSC_VER) && defined(_M_X64) + tid = __readgsqword(48); +#elif defined(_MSC_VER) && defined(_M_IX86) + tid = __readfsdword(24); +#elif defined(_MSC_VER) && defined(_M_ARM64) + tid = __getReg(18); +#elif defined(__MINGW32__) && defined(_M_X64) + tid = __readgsqword(48); +#elif defined(__MINGW32__) && defined(_M_IX86) + tid = __readfsdword(24); +#elif defined(__MINGW32__) && defined(_M_ARM64) + tid = __getReg(18); +#elif defined(__i386__) + __asm__("movl %%gs:0, %0" : "=r" (tid)); // 32-bit always uses GS +#elif defined(__MACH__) && defined(__x86_64__) + __asm__("movq %%gs:0, %0" : "=r" (tid)); // x86_64 macOSX uses GS +#elif defined(__x86_64__) + __asm__("movq %%fs:0, %0" : "=r" (tid)); // x86_64 Linux, BSD uses FS +#elif defined(__arm__) && __ARM_ARCH >= 7 + __asm__ ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tid)); +#elif defined(__aarch64__) && defined(__APPLE__) + __asm__ ("mrs %0, tpidrro_el0" : "=r" (tid)); +#elif defined(__aarch64__) + __asm__ ("mrs %0, tpidr_el0" : "=r" (tid)); +#elif defined(__powerpc64__) + #if defined(__clang__) && _Py__has_builtin(__builtin_thread_pointer) + tid = (uintptr_t)__builtin_thread_pointer(); + #else + // r13 is reserved for use as system thread ID by the Power 64-bit ABI. + register uintptr_t tp __asm__ ("r13"); + __asm__("" : "=r" (tp)); + tid = tp; + #endif +#elif defined(__powerpc__) + #if defined(__clang__) && _Py__has_builtin(__builtin_thread_pointer) + tid = (uintptr_t)__builtin_thread_pointer(); + #else + // r2 is reserved for use as system thread ID by the Power 32-bit ABI. + register uintptr_t tp __asm__ ("r2"); + __asm__ ("" : "=r" (tp)); + tid = tp; + #endif +#elif defined(__s390__) && defined(__GNUC__) + // Both GCC and Clang have supported __builtin_thread_pointer + // for s390 from long time ago. + tid = (uintptr_t)__builtin_thread_pointer(); +#elif defined(__riscv) + #if defined(__clang__) && _Py__has_builtin(__builtin_thread_pointer) + tid = (uintptr_t)__builtin_thread_pointer(); + #else + // tp is Thread Pointer provided by the RISC-V ABI. + __asm__ ("mv %0, tp" : "=r" (tid)); + #endif +#else + // Fallback to a portable implementation if we do not have a faster + // platform-specific implementation. 
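+    // (Descriptive note: the fallback relies on the address of a
+    // thread-local variable, which is necessarily unique to the running
+    // thread; uniqueness is all this identifier needs to provide.)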
+ tid = _Py_GetThreadLocal_Addr(); +#endif + return tid; +} + +static inline Py_ALWAYS_INLINE int +_Py_IsOwnedByCurrentThread(PyObject *ob) +{ +#ifdef _Py_THREAD_SANITIZER + return _Py_atomic_load_uintptr_relaxed(&ob->ob_tid) == _Py_ThreadId(); +#else + return ob->ob_tid == _Py_ThreadId(); +#endif +} +#endif + +static inline Py_ssize_t Py_REFCNT(PyObject *ob) { +#if !defined(Py_GIL_DISABLED) + return ob->ob_refcnt; +#else + uint32_t local = _Py_atomic_load_uint32_relaxed(&ob->ob_ref_local); + if (local == _Py_IMMORTAL_REFCNT_LOCAL) { + return _Py_IMMORTAL_REFCNT; + } + Py_ssize_t shared = _Py_atomic_load_ssize_relaxed(&ob->ob_ref_shared); + return _Py_STATIC_CAST(Py_ssize_t, local) + + Py_ARITHMETIC_RIGHT_SHIFT(Py_ssize_t, shared, _Py_REF_SHARED_SHIFT); +#endif +} +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define Py_REFCNT(ob) Py_REFCNT(_PyObject_CAST(ob)) +#endif + + +// bpo-39573: The Py_SET_TYPE() function must be used to set an object type. +static inline PyTypeObject* Py_TYPE(PyObject *ob) { + return ob->ob_type; +} +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define Py_TYPE(ob) Py_TYPE(_PyObject_CAST(ob)) +#endif + +PyAPI_DATA(PyTypeObject) PyLong_Type; +PyAPI_DATA(PyTypeObject) PyBool_Type; + +// bpo-39573: The Py_SET_SIZE() function must be used to set an object size. +static inline Py_ssize_t Py_SIZE(PyObject *ob) { + assert(ob->ob_type != &PyLong_Type); + assert(ob->ob_type != &PyBool_Type); + return _PyVarObject_CAST(ob)->ob_size; +} +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define Py_SIZE(ob) Py_SIZE(_PyObject_CAST(ob)) +#endif + +static inline Py_ALWAYS_INLINE int _Py_IsImmortal(PyObject *op) +{ +#if defined(Py_GIL_DISABLED) + return (_Py_atomic_load_uint32_relaxed(&op->ob_ref_local) == + _Py_IMMORTAL_REFCNT_LOCAL); +#elif SIZEOF_VOID_P > 4 + return (_Py_CAST(PY_INT32_T, op->ob_refcnt) < 0); +#else + return (op->ob_refcnt == _Py_IMMORTAL_REFCNT); +#endif +} +#define _Py_IsImmortal(op) _Py_IsImmortal(_PyObject_CAST(op)) + +static inline int Py_IS_TYPE(PyObject *ob, PyTypeObject *type) { + return Py_TYPE(ob) == type; +} +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define Py_IS_TYPE(ob, type) Py_IS_TYPE(_PyObject_CAST(ob), (type)) +#endif + + +// Py_SET_REFCNT() implementation for stable ABI +PyAPI_FUNC(void) _Py_SetRefcnt(PyObject *ob, Py_ssize_t refcnt); + +static inline void Py_SET_REFCNT(PyObject *ob, Py_ssize_t refcnt) { +#if defined(Py_LIMITED_API) && Py_LIMITED_API+0 >= 0x030d0000 + // Stable ABI implements Py_SET_REFCNT() as a function call + // on limited C API version 3.13 and newer. + _Py_SetRefcnt(ob, refcnt); +#else + // This immortal check is for code that is unaware of immortal objects. + // The runtime tracks these objects and we should avoid as much + // as possible having extensions inadvertently change the refcnt + // of an immortalized object. + if (_Py_IsImmortal(ob)) { + return; + } + +#ifndef Py_GIL_DISABLED + ob->ob_refcnt = refcnt; +#else + if (_Py_IsOwnedByCurrentThread(ob)) { + if ((size_t)refcnt > (size_t)UINT32_MAX) { + // On overflow, make the object immortal + ob->ob_tid = _Py_UNOWNED_TID; + ob->ob_ref_local = _Py_IMMORTAL_REFCNT_LOCAL; + ob->ob_ref_shared = 0; + } + else { + // Set local refcount to desired refcount and shared refcount + // to zero, but preserve the shared refcount flags. 
+ ob->ob_ref_local = _Py_STATIC_CAST(uint32_t, refcnt); + ob->ob_ref_shared &= _Py_REF_SHARED_FLAG_MASK; + } + } + else { + // Set local refcount to zero and shared refcount to desired refcount. + // Mark the object as merged. + ob->ob_tid = _Py_UNOWNED_TID; + ob->ob_ref_local = 0; + ob->ob_ref_shared = _Py_REF_SHARED(refcnt, _Py_REF_MERGED); + } +#endif // Py_GIL_DISABLED +#endif // Py_LIMITED_API+0 < 0x030d0000 +} +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define Py_SET_REFCNT(ob, refcnt) Py_SET_REFCNT(_PyObject_CAST(ob), (refcnt)) +#endif + + +static inline void Py_SET_TYPE(PyObject *ob, PyTypeObject *type) { + ob->ob_type = type; +} +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define Py_SET_TYPE(ob, type) Py_SET_TYPE(_PyObject_CAST(ob), type) +#endif + +static inline void Py_SET_SIZE(PyVarObject *ob, Py_ssize_t size) { + assert(ob->ob_base.ob_type != &PyLong_Type); + assert(ob->ob_base.ob_type != &PyBool_Type); +#ifdef Py_GIL_DISABLED + _Py_atomic_store_ssize_relaxed(&ob->ob_size, size); +#else + ob->ob_size = size; +#endif +} +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define Py_SET_SIZE(ob, size) Py_SET_SIZE(_PyVarObject_CAST(ob), (size)) +#endif + + +/* +Type objects contain a string containing the type name (to help somewhat +in debugging), the allocation parameters (see PyObject_New() and +PyObject_NewVar()), +and methods for accessing objects of the type. Methods are optional, a +nil pointer meaning that particular kind of access is not available for +this type. The Py_DECREF() macro uses the tp_dealloc method without +checking for a nil pointer; it should always be implemented except if +the implementation can guarantee that the reference count will never +reach zero (e.g., for statically allocated type objects). + +NB: the methods for certain type groups are now contained in separate +method blocks. 
+*/ + +typedef PyObject * (*unaryfunc)(PyObject *); +typedef PyObject * (*binaryfunc)(PyObject *, PyObject *); +typedef PyObject * (*ternaryfunc)(PyObject *, PyObject *, PyObject *); +typedef int (*inquiry)(PyObject *); +typedef Py_ssize_t (*lenfunc)(PyObject *); +typedef PyObject *(*ssizeargfunc)(PyObject *, Py_ssize_t); +typedef PyObject *(*ssizessizeargfunc)(PyObject *, Py_ssize_t, Py_ssize_t); +typedef int(*ssizeobjargproc)(PyObject *, Py_ssize_t, PyObject *); +typedef int(*ssizessizeobjargproc)(PyObject *, Py_ssize_t, Py_ssize_t, PyObject *); +typedef int(*objobjargproc)(PyObject *, PyObject *, PyObject *); + +typedef int (*objobjproc)(PyObject *, PyObject *); +typedef int (*visitproc)(PyObject *, void *); +typedef int (*traverseproc)(PyObject *, visitproc, void *); + + +typedef void (*freefunc)(void *); +typedef void (*destructor)(PyObject *); +typedef PyObject *(*getattrfunc)(PyObject *, char *); +typedef PyObject *(*getattrofunc)(PyObject *, PyObject *); +typedef int (*setattrfunc)(PyObject *, char *, PyObject *); +typedef int (*setattrofunc)(PyObject *, PyObject *, PyObject *); +typedef PyObject *(*reprfunc)(PyObject *); +typedef Py_hash_t (*hashfunc)(PyObject *); +typedef PyObject *(*richcmpfunc) (PyObject *, PyObject *, int); +typedef PyObject *(*getiterfunc) (PyObject *); +typedef PyObject *(*iternextfunc) (PyObject *); +typedef PyObject *(*descrgetfunc) (PyObject *, PyObject *, PyObject *); +typedef int (*descrsetfunc) (PyObject *, PyObject *, PyObject *); +typedef int (*initproc)(PyObject *, PyObject *, PyObject *); +typedef PyObject *(*newfunc)(PyTypeObject *, PyObject *, PyObject *); +typedef PyObject *(*allocfunc)(PyTypeObject *, Py_ssize_t); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030c0000 // 3.12 +typedef PyObject *(*vectorcallfunc)(PyObject *callable, PyObject *const *args, + size_t nargsf, PyObject *kwnames); +#endif + +typedef struct{ + int slot; /* slot id, see below */ + void *pfunc; /* function pointer */ +} PyType_Slot; + +typedef struct{ + const char* name; + int basicsize; + int itemsize; + unsigned int flags; + PyType_Slot *slots; /* terminated by slot==0. 
*/ +} PyType_Spec; + +PyAPI_FUNC(PyObject*) PyType_FromSpec(PyType_Spec*); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(PyObject*) PyType_FromSpecWithBases(PyType_Spec*, PyObject*); +#endif +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03040000 +PyAPI_FUNC(void*) PyType_GetSlot(PyTypeObject*, int); +#endif +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000 +PyAPI_FUNC(PyObject*) PyType_FromModuleAndSpec(PyObject *, PyType_Spec *, PyObject *); +PyAPI_FUNC(PyObject *) PyType_GetModule(PyTypeObject *); +PyAPI_FUNC(void *) PyType_GetModuleState(PyTypeObject *); +#endif +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030B0000 +PyAPI_FUNC(PyObject *) PyType_GetName(PyTypeObject *); +PyAPI_FUNC(PyObject *) PyType_GetQualName(PyTypeObject *); +#endif +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 +PyAPI_FUNC(PyObject *) PyType_GetFullyQualifiedName(PyTypeObject *type); +PyAPI_FUNC(PyObject *) PyType_GetModuleName(PyTypeObject *type); +#endif +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030C0000 +PyAPI_FUNC(PyObject *) PyType_FromMetaclass(PyTypeObject*, PyObject*, PyType_Spec*, PyObject*); +PyAPI_FUNC(void *) PyObject_GetTypeData(PyObject *obj, PyTypeObject *cls); +PyAPI_FUNC(Py_ssize_t) PyType_GetTypeDataSize(PyTypeObject *cls); +#endif + +/* Generic type check */ +PyAPI_FUNC(int) PyType_IsSubtype(PyTypeObject *, PyTypeObject *); + +static inline int PyObject_TypeCheck(PyObject *ob, PyTypeObject *type) { + return Py_IS_TYPE(ob, type) || PyType_IsSubtype(Py_TYPE(ob), type); +} +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define PyObject_TypeCheck(ob, type) PyObject_TypeCheck(_PyObject_CAST(ob), (type)) +#endif + +PyAPI_DATA(PyTypeObject) PyType_Type; /* built-in 'type' */ +PyAPI_DATA(PyTypeObject) PyBaseObject_Type; /* built-in 'object' */ +PyAPI_DATA(PyTypeObject) PySuper_Type; /* built-in 'super' */ + +PyAPI_FUNC(unsigned long) PyType_GetFlags(PyTypeObject*); + +PyAPI_FUNC(int) PyType_Ready(PyTypeObject *); +PyAPI_FUNC(PyObject *) PyType_GenericAlloc(PyTypeObject *, Py_ssize_t); +PyAPI_FUNC(PyObject *) PyType_GenericNew(PyTypeObject *, + PyObject *, PyObject *); +PyAPI_FUNC(unsigned int) PyType_ClearCache(void); +PyAPI_FUNC(void) PyType_Modified(PyTypeObject *); + +/* Generic operations on objects */ +PyAPI_FUNC(PyObject *) PyObject_Repr(PyObject *); +PyAPI_FUNC(PyObject *) PyObject_Str(PyObject *); +PyAPI_FUNC(PyObject *) PyObject_ASCII(PyObject *); +PyAPI_FUNC(PyObject *) PyObject_Bytes(PyObject *); +PyAPI_FUNC(PyObject *) PyObject_RichCompare(PyObject *, PyObject *, int); +PyAPI_FUNC(int) PyObject_RichCompareBool(PyObject *, PyObject *, int); +PyAPI_FUNC(PyObject *) PyObject_GetAttrString(PyObject *, const char *); +PyAPI_FUNC(int) PyObject_SetAttrString(PyObject *, const char *, PyObject *); +PyAPI_FUNC(int) PyObject_DelAttrString(PyObject *v, const char *name); +PyAPI_FUNC(int) PyObject_HasAttrString(PyObject *, const char *); +PyAPI_FUNC(PyObject *) PyObject_GetAttr(PyObject *, PyObject *); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030d0000 +PyAPI_FUNC(int) PyObject_GetOptionalAttr(PyObject *, PyObject *, PyObject **); +PyAPI_FUNC(int) PyObject_GetOptionalAttrString(PyObject *, const char *, PyObject **); +#endif +PyAPI_FUNC(int) PyObject_SetAttr(PyObject *, PyObject *, PyObject *); +PyAPI_FUNC(int) PyObject_DelAttr(PyObject *v, PyObject *name); +PyAPI_FUNC(int) PyObject_HasAttr(PyObject *, PyObject *); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030d0000 
+PyAPI_FUNC(int) PyObject_HasAttrWithError(PyObject *, PyObject *); +PyAPI_FUNC(int) PyObject_HasAttrStringWithError(PyObject *, const char *); +#endif +PyAPI_FUNC(PyObject *) PyObject_SelfIter(PyObject *); +PyAPI_FUNC(PyObject *) PyObject_GenericGetAttr(PyObject *, PyObject *); +PyAPI_FUNC(int) PyObject_GenericSetAttr(PyObject *, PyObject *, PyObject *); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(int) PyObject_GenericSetDict(PyObject *, PyObject *, void *); +#endif +PyAPI_FUNC(Py_hash_t) PyObject_Hash(PyObject *); +PyAPI_FUNC(Py_hash_t) PyObject_HashNotImplemented(PyObject *); +PyAPI_FUNC(int) PyObject_IsTrue(PyObject *); +PyAPI_FUNC(int) PyObject_Not(PyObject *); +PyAPI_FUNC(int) PyCallable_Check(PyObject *); +PyAPI_FUNC(void) PyObject_ClearWeakRefs(PyObject *); + +/* PyObject_Dir(obj) acts like Python builtins.dir(obj), returning a + list of strings. PyObject_Dir(NULL) is like builtins.dir(), + returning the names of the current locals. In this case, if there are + no current locals, NULL is returned, and PyErr_Occurred() is false. +*/ +PyAPI_FUNC(PyObject *) PyObject_Dir(PyObject *); + +/* Helpers for printing recursive container types */ +PyAPI_FUNC(int) Py_ReprEnter(PyObject *); +PyAPI_FUNC(void) Py_ReprLeave(PyObject *); + +/* Flag bits for printing: */ +#define Py_PRINT_RAW 1 /* No string quotes etc. */ + +/* +Type flags (tp_flags) + +These flags are used to change expected features and behavior for a +particular type. + +Arbitration of the flag bit positions will need to be coordinated among +all extension writers who publicly release their extensions (this will +be fewer than you might expect!). + +Most flags were removed as of Python 3.0 to make room for new flags. (Some +flags are not for backwards compatibility but to indicate the presence of an +optional feature; these flags remain of course.) + +Type definitions should use Py_TPFLAGS_DEFAULT for their tp_flags value. + +Code can use PyType_HasFeature(type_ob, flag_value) to test whether the +given type object has a specified feature. +*/ + +#ifndef Py_LIMITED_API + +/* Track types initialized using _PyStaticType_InitBuiltin(). */ +#define _Py_TPFLAGS_STATIC_BUILTIN (1 << 1) + +/* The values array is placed inline directly after the rest of + * the object. Implies Py_TPFLAGS_HAVE_GC. + */ +#define Py_TPFLAGS_INLINE_VALUES (1 << 2) + +/* Placement of weakref pointers are managed by the VM, not by the type. + * The VM will automatically set tp_weaklistoffset. + */ +#define Py_TPFLAGS_MANAGED_WEAKREF (1 << 3) + +/* Placement of dict (and values) pointers are managed by the VM, not by the type. + * The VM will automatically set tp_dictoffset. Implies Py_TPFLAGS_HAVE_GC. + */ +#define Py_TPFLAGS_MANAGED_DICT (1 << 4) + +#define Py_TPFLAGS_PREHEADER (Py_TPFLAGS_MANAGED_WEAKREF | Py_TPFLAGS_MANAGED_DICT) + +/* Set if instances of the type object are treated as sequences for pattern matching */ +#define Py_TPFLAGS_SEQUENCE (1 << 5) +/* Set if instances of the type object are treated as mappings for pattern matching */ +#define Py_TPFLAGS_MAPPING (1 << 6) +#endif + +/* Disallow creating instances of the type: set tp_new to NULL and don't create + * the "__new__" key in the type dictionary. 
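+ *
+ * A hedged sketch of a heap type created with this flag through
+ * PyType_FromSpec(); the names token_slots, token_spec and "spam.Token"
+ * are hypothetical:
+ *
+ *     static PyType_Slot token_slots[] = {
+ *         {0, NULL}
+ *     };
+ *     static PyType_Spec token_spec = {
+ *         "spam.Token",       // name
+ *         sizeof(PyObject),   // basicsize
+ *         0,                  // itemsize
+ *         Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
+ *         token_slots,
+ *     };
+ *     PyObject *tp = PyType_FromSpec(&token_spec);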
*/ +#define Py_TPFLAGS_DISALLOW_INSTANTIATION (1UL << 7) + +/* Set if the type object is immutable: type attributes cannot be set nor deleted */ +#define Py_TPFLAGS_IMMUTABLETYPE (1UL << 8) + +/* Set if the type object is dynamically allocated */ +#define Py_TPFLAGS_HEAPTYPE (1UL << 9) + +/* Set if the type allows subclassing */ +#define Py_TPFLAGS_BASETYPE (1UL << 10) + +/* Set if the type implements the vectorcall protocol (PEP 590) */ +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030C0000 +#define Py_TPFLAGS_HAVE_VECTORCALL (1UL << 11) +#ifndef Py_LIMITED_API +// Backwards compatibility alias for API that was provisional in Python 3.8 +#define _Py_TPFLAGS_HAVE_VECTORCALL Py_TPFLAGS_HAVE_VECTORCALL +#endif +#endif + +/* Set if the type is 'ready' -- fully initialized */ +#define Py_TPFLAGS_READY (1UL << 12) + +/* Set while the type is being 'readied', to prevent recursive ready calls */ +#define Py_TPFLAGS_READYING (1UL << 13) + +/* Objects support garbage collection (see objimpl.h) */ +#define Py_TPFLAGS_HAVE_GC (1UL << 14) + +/* These two bits are preserved for Stackless Python, next after this is 17 */ +#ifdef STACKLESS +#define Py_TPFLAGS_HAVE_STACKLESS_EXTENSION (3UL << 15) +#else +#define Py_TPFLAGS_HAVE_STACKLESS_EXTENSION 0 +#endif + +/* Objects behave like an unbound method */ +#define Py_TPFLAGS_METHOD_DESCRIPTOR (1UL << 17) + +/* Unused. Legacy flag */ +#define Py_TPFLAGS_VALID_VERSION_TAG (1UL << 19) + +/* Type is abstract and cannot be instantiated */ +#define Py_TPFLAGS_IS_ABSTRACT (1UL << 20) + +// This undocumented flag gives certain built-ins their unique pattern-matching +// behavior, which allows a single positional subpattern to match against the +// subject itself (rather than a mapped attribute on it): +#define _Py_TPFLAGS_MATCH_SELF (1UL << 22) + +/* Items (ob_size*tp_itemsize) are found at the end of an instance's memory */ +#define Py_TPFLAGS_ITEMS_AT_END (1UL << 23) + +/* These flags are used to determine if a type is a subclass. */ +#define Py_TPFLAGS_LONG_SUBCLASS (1UL << 24) +#define Py_TPFLAGS_LIST_SUBCLASS (1UL << 25) +#define Py_TPFLAGS_TUPLE_SUBCLASS (1UL << 26) +#define Py_TPFLAGS_BYTES_SUBCLASS (1UL << 27) +#define Py_TPFLAGS_UNICODE_SUBCLASS (1UL << 28) +#define Py_TPFLAGS_DICT_SUBCLASS (1UL << 29) +#define Py_TPFLAGS_BASE_EXC_SUBCLASS (1UL << 30) +#define Py_TPFLAGS_TYPE_SUBCLASS (1UL << 31) + +#define Py_TPFLAGS_DEFAULT ( \ + Py_TPFLAGS_HAVE_STACKLESS_EXTENSION | \ + 0) + +/* NOTE: Some of the following flags reuse lower bits (removed as part of the + * Python 3.0 transition). */ + +/* The following flags are kept for compatibility; in previous + * versions they indicated presence of newer tp_* fields on the + * type struct. + * Starting with 3.8, binary compatibility of C extensions across + * feature releases of Python is not supported anymore (except when + * using the stable ABI, in which all classes are created dynamically, + * using the interpreter's memory layout.) + * Note that older extensions using the stable ABI set these flags, + * so the bits must not be repurposed. + */ +#define Py_TPFLAGS_HAVE_FINALIZE (1UL << 0) +#define Py_TPFLAGS_HAVE_VERSION_TAG (1UL << 18) + + +/* +The macros Py_INCREF(op) and Py_DECREF(op) are used to increment or decrement +reference counts. Py_DECREF calls the object's deallocator function when +the refcount falls to 0; for +objects that don't contain references to other objects or heap memory +this can be the standard function free(). Both macros can be used +wherever a void expression is allowed. 
The argument must not be a
+NULL pointer. If it may be NULL, use Py_XINCREF/Py_XDECREF instead.
+The macro _Py_NewReference(op) initializes reference counts to 1, and
+in special builds (Py_REF_DEBUG, Py_TRACE_REFS) performs additional
+bookkeeping appropriate to the special build.
+
+We assume that the reference count field can never overflow; this can
+be proven when the size of the field is the same as the pointer size, so
+we ignore the possibility. Provided a C int is at least 32 bits (which
+is implicitly assumed in many parts of this code), that's enough for
+about 2**31 references to an object.
+
+XXX The following became out of date in Python 2.2, but I'm not sure
+XXX what the full truth is now. Certainly, heap-allocated type objects
+XXX can and should be deallocated.
+Type objects should never be deallocated; the type pointer in an object
+is not considered to be a reference to the type object, to save
+complications in the deallocation function. (This is actually a
+decision that's up to the implementer of each new type so if you want,
+you can count such references to the type object.)
+*/
+
+#if defined(Py_REF_DEBUG) && !defined(Py_LIMITED_API)
+PyAPI_FUNC(void) _Py_NegativeRefcount(const char *filename, int lineno,
+                                      PyObject *op);
+PyAPI_FUNC(void) _Py_INCREF_IncRefTotal(void);
+PyAPI_FUNC(void) _Py_DECREF_DecRefTotal(void);
+#endif // Py_REF_DEBUG && !Py_LIMITED_API
+
+PyAPI_FUNC(void) _Py_Dealloc(PyObject *);
+
+/*
+These are provided as conveniences to Python runtime embedders, so that
+they can have object code that is not dependent on Python compilation flags.
+*/
+PyAPI_FUNC(void) Py_IncRef(PyObject *);
+PyAPI_FUNC(void) Py_DecRef(PyObject *);
+
+// Similar to Py_IncRef() and Py_DecRef() but the argument must be non-NULL.
+// Private functions used by Py_INCREF() and Py_DECREF().
+PyAPI_FUNC(void) _Py_IncRef(PyObject *);
+PyAPI_FUNC(void) _Py_DecRef(PyObject *);
+
+static inline Py_ALWAYS_INLINE void Py_INCREF(PyObject *op)
+{
+#if defined(Py_LIMITED_API) && (Py_LIMITED_API+0 >= 0x030c0000 || defined(Py_REF_DEBUG))
+    // Stable ABI implements Py_INCREF() as a function call on limited C API
+    // version 3.12 and newer, and on Python built in debug mode. _Py_IncRef()
+    // was added to Python 3.10.0a7, use Py_IncRef() on older Python versions.
+    // Py_IncRef() accepts NULL whereas _Py_IncRef() doesn't.
+# if Py_LIMITED_API+0 >= 0x030a00A7
+    _Py_IncRef(op);
+# else
+    Py_IncRef(op);
+# endif
+#else
+    // Non-limited C API and limited C API for Python 3.9 and older access
+    // directly PyObject.ob_refcnt.
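+    // (Descriptive note: the branches below select, in order, the
+    // free-threaded path, a 64-bit saturating add that leaves immortal
+    // objects untouched, and a plain increment guarded by an explicit
+    // immortality check.)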
+#if defined(Py_GIL_DISABLED)
+    uint32_t local = _Py_atomic_load_uint32_relaxed(&op->ob_ref_local);
+    uint32_t new_local = local + 1;
+    if (new_local == 0) {
+        // local is equal to _Py_IMMORTAL_REFCNT: do nothing
+        return;
+    }
+    if (_Py_IsOwnedByCurrentThread(op)) {
+        _Py_atomic_store_uint32_relaxed(&op->ob_ref_local, new_local);
+    }
+    else {
+        _Py_atomic_add_ssize(&op->ob_ref_shared, (1 << _Py_REF_SHARED_SHIFT));
+    }
+#elif SIZEOF_VOID_P > 4
+    // Portable saturated add, branching on the carry flag and set low bits
+    PY_UINT32_T cur_refcnt = op->ob_refcnt_split[PY_BIG_ENDIAN];
+    PY_UINT32_T new_refcnt = cur_refcnt + 1;
+    if (new_refcnt == 0) {
+        // cur_refcnt is equal to _Py_IMMORTAL_REFCNT: the object is immortal,
+        // do nothing
+        return;
+    }
+    op->ob_refcnt_split[PY_BIG_ENDIAN] = new_refcnt;
+#else
+    // Explicitly check immortality against the immortal value
+    if (_Py_IsImmortal(op)) {
+        return;
+    }
+    op->ob_refcnt++;
+#endif
+    _Py_INCREF_STAT_INC();
+#ifdef Py_REF_DEBUG
+    _Py_INCREF_IncRefTotal();
+#endif
+#endif
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
+# define Py_INCREF(op) Py_INCREF(_PyObject_CAST(op))
+#endif
+
+
+#if !defined(Py_LIMITED_API) && defined(Py_GIL_DISABLED)
+// Implements Py_DECREF on objects not owned by the current thread.
+PyAPI_FUNC(void) _Py_DecRefShared(PyObject *);
+PyAPI_FUNC(void) _Py_DecRefSharedDebug(PyObject *, const char *, int);
+
+// Called from Py_DECREF by the owning thread when the local refcount reaches
+// zero. The call will deallocate the object if the shared refcount is also
+// zero. Otherwise, the thread gives up ownership and merges the reference
+// count fields.
+PyAPI_FUNC(void) _Py_MergeZeroLocalRefcount(PyObject *);
+#endif
+
+#if defined(Py_LIMITED_API) && (Py_LIMITED_API+0 >= 0x030c0000 || defined(Py_REF_DEBUG))
+// Stable ABI implements Py_DECREF() as a function call on limited C API
+// version 3.12 and newer, and on Python built in debug mode. _Py_DecRef() was
+// added to Python 3.10.0a7, use Py_DecRef() on older Python versions.
+// Py_DecRef() accepts NULL whereas _Py_DecRef() doesn't.
+static inline void Py_DECREF(PyObject *op) { +# if Py_LIMITED_API+0 >= 0x030a00A7 + _Py_DecRef(op); +# else + Py_DecRef(op); +# endif +} +#define Py_DECREF(op) Py_DECREF(_PyObject_CAST(op)) + +#elif defined(Py_GIL_DISABLED) && defined(Py_REF_DEBUG) +static inline void Py_DECREF(const char *filename, int lineno, PyObject *op) +{ + uint32_t local = _Py_atomic_load_uint32_relaxed(&op->ob_ref_local); + if (local == _Py_IMMORTAL_REFCNT_LOCAL) { + return; + } + _Py_DECREF_STAT_INC(); + _Py_DECREF_DecRefTotal(); + if (_Py_IsOwnedByCurrentThread(op)) { + if (local == 0) { + _Py_NegativeRefcount(filename, lineno, op); + } + local--; + _Py_atomic_store_uint32_relaxed(&op->ob_ref_local, local); + if (local == 0) { + _Py_MergeZeroLocalRefcount(op); + } + } + else { + _Py_DecRefSharedDebug(op, filename, lineno); + } +} +#define Py_DECREF(op) Py_DECREF(__FILE__, __LINE__, _PyObject_CAST(op)) + +#elif defined(Py_GIL_DISABLED) +static inline void Py_DECREF(PyObject *op) +{ + uint32_t local = _Py_atomic_load_uint32_relaxed(&op->ob_ref_local); + if (local == _Py_IMMORTAL_REFCNT_LOCAL) { + return; + } + _Py_DECREF_STAT_INC(); + if (_Py_IsOwnedByCurrentThread(op)) { + local--; + _Py_atomic_store_uint32_relaxed(&op->ob_ref_local, local); + if (local == 0) { + _Py_MergeZeroLocalRefcount(op); + } + } + else { + _Py_DecRefShared(op); + } +} +#define Py_DECREF(op) Py_DECREF(_PyObject_CAST(op)) + +#elif defined(Py_REF_DEBUG) +static inline void Py_DECREF(const char *filename, int lineno, PyObject *op) +{ + if (op->ob_refcnt <= 0) { + _Py_NegativeRefcount(filename, lineno, op); + } + if (_Py_IsImmortal(op)) { + return; + } + _Py_DECREF_STAT_INC(); + _Py_DECREF_DecRefTotal(); + if (--op->ob_refcnt == 0) { + _Py_Dealloc(op); + } +} +#define Py_DECREF(op) Py_DECREF(__FILE__, __LINE__, _PyObject_CAST(op)) + +#else +static inline Py_ALWAYS_INLINE void Py_DECREF(PyObject *op) +{ + // Non-limited C API and limited C API for Python 3.9 and older access + // directly PyObject.ob_refcnt. + if (_Py_IsImmortal(op)) { + return; + } + _Py_DECREF_STAT_INC(); + if (--op->ob_refcnt == 0) { + _Py_Dealloc(op); + } +} +#define Py_DECREF(op) Py_DECREF(_PyObject_CAST(op)) +#endif + + +/* Safely decref `op` and set `op` to NULL, especially useful in tp_clear + * and tp_dealloc implementations. + * + * Note that "the obvious" code can be deadly: + * + * Py_XDECREF(op); + * op = NULL; + * + * Typically, `op` is something like self->containee, and `self` is done + * using its `containee` member. In the code sequence above, suppose + * `containee` is non-NULL with a refcount of 1. Its refcount falls to + * 0 on the first line, which can trigger an arbitrary amount of code, + * possibly including finalizers (like __del__ methods or weakref callbacks) + * coded in Python, which in turn can release the GIL and allow other threads + * to run, etc. Such code may even invoke methods of `self` again, or cause + * cyclic gc to trigger, but-- oops! --self->containee still points to the + * object being torn down, and it may be in an insane state while being torn + * down. This has in fact been a rich historic source of miserable (rare & + * hard-to-diagnose) segfaulting (and other) bugs. + * + * The safe way is: + * + * Py_CLEAR(op); + * + * That arranges to set `op` to NULL _before_ decref'ing, so that any code + * triggered as a side-effect of `op` getting torn down no longer believes + * `op` points to a valid object. + * + * There are cases where it's safe to use the naive code, but they're brittle. 
+ * For example, if `op` points to a Python integer, you know that destroying + * one of those can't cause problems -- but in part that relies on that + * Python integers aren't currently weakly referencable. Best practice is + * to use Py_CLEAR() even if you can't think of a reason for why you need to. + * + * gh-98724: Use a temporary variable to only evaluate the macro argument once, + * to avoid the duplication of side effects if the argument has side effects. + * + * gh-99701: If the PyObject* type is used with casting arguments to PyObject*, + * the code can be miscompiled with strict aliasing because of type punning. + * With strict aliasing, a compiler considers that two pointers of different + * types cannot read or write the same memory which enables optimization + * opportunities. + * + * If available, use _Py_TYPEOF() to use the 'op' type for temporary variables, + * and so avoid type punning. Otherwise, use memcpy() which causes type erasure + * and so prevents the compiler to reuse an old cached 'op' value after + * Py_CLEAR(). + */ +#ifdef _Py_TYPEOF +#define Py_CLEAR(op) \ + do { \ + _Py_TYPEOF(op)* _tmp_op_ptr = &(op); \ + _Py_TYPEOF(op) _tmp_old_op = (*_tmp_op_ptr); \ + if (_tmp_old_op != NULL) { \ + *_tmp_op_ptr = _Py_NULL; \ + Py_DECREF(_tmp_old_op); \ + } \ + } while (0) +#else +#define Py_CLEAR(op) \ + do { \ + PyObject **_tmp_op_ptr = _Py_CAST(PyObject**, &(op)); \ + PyObject *_tmp_old_op = (*_tmp_op_ptr); \ + if (_tmp_old_op != NULL) { \ + PyObject *_null_ptr = _Py_NULL; \ + memcpy(_tmp_op_ptr, &_null_ptr, sizeof(PyObject*)); \ + Py_DECREF(_tmp_old_op); \ + } \ + } while (0) +#endif + + +/* Function to use in case the object pointer can be NULL: */ +static inline void Py_XINCREF(PyObject *op) +{ + if (op != _Py_NULL) { + Py_INCREF(op); + } +} +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define Py_XINCREF(op) Py_XINCREF(_PyObject_CAST(op)) +#endif + +static inline void Py_XDECREF(PyObject *op) +{ + if (op != _Py_NULL) { + Py_DECREF(op); + } +} +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define Py_XDECREF(op) Py_XDECREF(_PyObject_CAST(op)) +#endif + +// Create a new strong reference to an object: +// increment the reference count of the object and return the object. +PyAPI_FUNC(PyObject*) Py_NewRef(PyObject *obj); + +// Similar to Py_NewRef(), but the object can be NULL. +PyAPI_FUNC(PyObject*) Py_XNewRef(PyObject *obj); + +static inline PyObject* _Py_NewRef(PyObject *obj) +{ + Py_INCREF(obj); + return obj; +} + +static inline PyObject* _Py_XNewRef(PyObject *obj) +{ + Py_XINCREF(obj); + return obj; +} + +// Py_NewRef() and Py_XNewRef() are exported as functions for the stable ABI. +// Names overridden with macros by static inline functions for best +// performances. 
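+//
+// A hedged usage sketch; the field name "cached" is illustrative only:
+//
+//     self->cached = Py_XNewRef(maybe_obj);  // take a strong reference,
+//                                            // tolerating NULL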
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define Py_NewRef(obj) _Py_NewRef(_PyObject_CAST(obj)) +# define Py_XNewRef(obj) _Py_XNewRef(_PyObject_CAST(obj)) +#else +# define Py_NewRef(obj) _Py_NewRef(obj) +# define Py_XNewRef(obj) _Py_XNewRef(obj) +#endif + + +#define Py_CONSTANT_NONE 0 +#define Py_CONSTANT_FALSE 1 +#define Py_CONSTANT_TRUE 2 +#define Py_CONSTANT_ELLIPSIS 3 +#define Py_CONSTANT_NOT_IMPLEMENTED 4 +#define Py_CONSTANT_ZERO 5 +#define Py_CONSTANT_ONE 6 +#define Py_CONSTANT_EMPTY_STR 7 +#define Py_CONSTANT_EMPTY_BYTES 8 +#define Py_CONSTANT_EMPTY_TUPLE 9 + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030d0000 +PyAPI_FUNC(PyObject*) Py_GetConstant(unsigned int constant_id); +PyAPI_FUNC(PyObject*) Py_GetConstantBorrowed(unsigned int constant_id); +#endif + + +/* +_Py_NoneStruct is an object of undefined type which can be used in contexts +where NULL (nil) is not suitable (since NULL often means 'error'). +*/ +PyAPI_DATA(PyObject) _Py_NoneStruct; /* Don't use this directly */ + +#if defined(Py_LIMITED_API) && Py_LIMITED_API+0 >= 0x030D0000 +# define Py_None Py_GetConstantBorrowed(Py_CONSTANT_NONE) +#else +# define Py_None (&_Py_NoneStruct) +#endif + +// Test if an object is the None singleton, the same as "x is None" in Python. +PyAPI_FUNC(int) Py_IsNone(PyObject *x); +#define Py_IsNone(x) Py_Is((x), Py_None) + +/* Macro for returning Py_None from a function. + * Only treat Py_None as immortal in the limited C API 3.12 and newer. */ +#if defined(Py_LIMITED_API) && Py_LIMITED_API+0 < 0x030c0000 +# define Py_RETURN_NONE return Py_NewRef(Py_None) +#else +# define Py_RETURN_NONE return Py_None +#endif + +/* +Py_NotImplemented is a singleton used to signal that an operation is +not implemented for a given type combination. +*/ +PyAPI_DATA(PyObject) _Py_NotImplementedStruct; /* Don't use this directly */ + +#if defined(Py_LIMITED_API) && Py_LIMITED_API+0 >= 0x030D0000 +# define Py_NotImplemented Py_GetConstantBorrowed(Py_CONSTANT_NOT_IMPLEMENTED) +#else +# define Py_NotImplemented (&_Py_NotImplementedStruct) +#endif + +/* Macro for returning Py_NotImplemented from a function */ +#define Py_RETURN_NOTIMPLEMENTED return Py_NotImplemented + +/* Rich comparison opcodes */ +#define Py_LT 0 +#define Py_LE 1 +#define Py_EQ 2 +#define Py_NE 3 +#define Py_GT 4 +#define Py_GE 5 + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 +/* Result of calling PyIter_Send */ +typedef enum { + PYGEN_RETURN = 0, + PYGEN_ERROR = -1, + PYGEN_NEXT = 1, +} PySendResult; +#endif + +/* + * Macro for implementing rich comparisons + * + * Needs to be a macro because any C-comparable type can be used. + */ +#define Py_RETURN_RICHCOMPARE(val1, val2, op) \ + do { \ + switch (op) { \ + case Py_EQ: if ((val1) == (val2)) Py_RETURN_TRUE; Py_RETURN_FALSE; \ + case Py_NE: if ((val1) != (val2)) Py_RETURN_TRUE; Py_RETURN_FALSE; \ + case Py_LT: if ((val1) < (val2)) Py_RETURN_TRUE; Py_RETURN_FALSE; \ + case Py_GT: if ((val1) > (val2)) Py_RETURN_TRUE; Py_RETURN_FALSE; \ + case Py_LE: if ((val1) <= (val2)) Py_RETURN_TRUE; Py_RETURN_FALSE; \ + case Py_GE: if ((val1) >= (val2)) Py_RETURN_TRUE; Py_RETURN_FALSE; \ + default: \ + Py_UNREACHABLE(); \ + } \ + } while (0) + + +/* +More conventions +================ + +Argument Checking +----------------- + +Functions that take objects as arguments normally don't check for nil +arguments, but they do check the type of the argument, and return an +error if the function doesn't apply to the type. 
+ +Failure Modes +------------- + +Functions may fail for a variety of reasons, including running out of +memory. This is communicated to the caller in two ways: an error string +is set (see errors.h), and the function result differs: functions that +normally return a pointer return NULL for failure, functions returning +an integer return -1 (which could be a legal return value too!), and +other functions return 0 for success and -1 for failure. +Callers should always check for errors before using the result. If +an error was set, the caller must either explicitly clear it, or pass +the error on to its caller. + +Reference Counts +---------------- + +It takes a while to get used to the proper usage of reference counts. + +Functions that create an object set the reference count to 1; such new +objects must be stored somewhere or destroyed again with Py_DECREF(). +Some functions that 'store' objects, such as PyTuple_SetItem() and +PyList_SetItem(), +don't increment the reference count of the object, since the most +frequent use is to store a fresh object. Functions that 'retrieve' +objects, such as PyTuple_GetItem() and PyDict_GetItemString(), also +don't increment +the reference count, since most frequently the object is only looked at +quickly. Thus, to retrieve an object and store it again, the caller +must call Py_INCREF() explicitly. + +NOTE: functions that 'consume' a reference count, like +PyList_SetItem(), consume the reference even if the object wasn't +successfully stored, to simplify error handling. + +It seems attractive to make other functions that take an object as +argument consume a reference count; however, this may quickly get +confusing (even the current practice is already confusing). Consider +it carefully, it may save lots of calls to Py_INCREF() and Py_DECREF() at +times. +*/ + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_OBJECT_H +# include "cpython/object.h" +# undef Py_CPYTHON_OBJECT_H +#endif + + +static inline int +PyType_HasFeature(PyTypeObject *type, unsigned long feature) +{ + unsigned long flags; +#ifdef Py_LIMITED_API + // PyTypeObject is opaque in the limited C API + flags = PyType_GetFlags(type); +#else +# ifdef Py_GIL_DISABLED + flags = _Py_atomic_load_ulong_relaxed(&type->tp_flags); +# else + flags = type->tp_flags; +# endif +#endif + return ((flags & feature) != 0); +} + +#define PyType_FastSubclass(type, flag) PyType_HasFeature((type), (flag)) + +static inline int PyType_Check(PyObject *op) { + return PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_TYPE_SUBCLASS); +} +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define PyType_Check(op) PyType_Check(_PyObject_CAST(op)) +#endif + +#define _PyType_CAST(op) \ + (assert(PyType_Check(op)), _Py_CAST(PyTypeObject*, (op))) + +static inline int PyType_CheckExact(PyObject *op) { + return Py_IS_TYPE(op, &PyType_Type); +} +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define PyType_CheckExact(op) PyType_CheckExact(_PyObject_CAST(op)) +#endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030d0000 +PyAPI_FUNC(PyObject *) PyType_GetModuleByDef(PyTypeObject *, PyModuleDef *); +#endif + +#ifdef __cplusplus +} +#endif +#endif // !Py_OBJECT_H diff --git a/Include/objimpl.h b/Include/objimpl.h new file mode 100644 index 0000000000000000000000000000000000000000..56472a72e42d341a56931646081d0221cf8113b2 --- /dev/null +++ b/Include/objimpl.h @@ -0,0 +1,211 @@ +// The PyObject_ memory family: high-level object memory interfaces. +// See pymem.h for the low-level PyMem_ family. 
+ +#ifndef Py_OBJIMPL_H +#define Py_OBJIMPL_H +#ifdef __cplusplus +extern "C" { +#endif + +/* BEWARE: + + Each interface exports both functions and macros. Extension modules should + use the functions, to ensure binary compatibility across Python versions. + Because the Python implementation is free to change internal details, and + the macros may (or may not) expose details for speed, if you do use the + macros you must recompile your extensions with each Python release. + + Never mix calls to PyObject_ memory functions with calls to the platform + malloc/realloc/ calloc/free, or with calls to PyMem_. +*/ + +/* +Functions and macros for modules that implement new object types. + + - PyObject_New(type, typeobj) allocates memory for a new object of the given + type, and initializes part of it. 'type' must be the C structure type used + to represent the object, and 'typeobj' the address of the corresponding + type object. Reference count and type pointer are filled in; the rest of + the bytes of the object are *undefined*! The resulting expression type is + 'type *'. The size of the object is determined by the tp_basicsize field + of the type object. + + - PyObject_NewVar(type, typeobj, n) is similar but allocates a variable-size + object with room for n items. In addition to the refcount and type pointer + fields, this also fills in the ob_size field. + + - PyObject_Free(op) releases the memory allocated for an object. It does not + run a destructor -- it only frees the memory. + + - PyObject_Init(op, typeobj) and PyObject_InitVar(op, typeobj, n) don't + allocate memory. Instead of a 'type' parameter, they take a pointer to a + new object (allocated by an arbitrary allocator), and initialize its object + header fields. + +Note that objects created with PyObject_{New, NewVar} are allocated using the +specialized Python allocator (implemented in obmalloc.c), if WITH_PYMALLOC is +enabled. In addition, a special debugging allocator is used if Py_DEBUG +macro is also defined. + +In case a specific form of memory management is needed (for example, if you +must use the platform malloc heap(s), or shared memory, or C++ local storage or +operator new), you must first allocate the object with your custom allocator, +then pass its pointer to PyObject_{Init, InitVar} for filling in its Python- +specific fields: reference count, type pointer, possibly others. You should +be aware that Python has no control over these objects because they don't +cooperate with the Python memory manager. Such objects may not be eligible +for automatic garbage collection and you have to make sure that they are +released accordingly whenever their destructor gets called (cf. the specific +form of memory management you're using). + +Unless you have specific memory management requirements, use +PyObject_{New, NewVar, Del}. +*/ + +/* + * Raw object memory interface + * =========================== + */ + +/* Functions to call the same malloc/realloc/free as used by Python's + object allocator. If WITH_PYMALLOC is enabled, these may differ from + the platform malloc/realloc/free. The Python object allocator is + designed for fast, cache-conscious allocation of many "small" objects, + and with low hidden memory overhead. + + PyObject_Malloc(0) returns a unique non-NULL pointer if possible. + + PyObject_Realloc(NULL, n) acts like PyObject_Malloc(n). + PyObject_Realloc(p != NULL, 0) does not return NULL, or free the memory + at p. 
+ + Returned pointers must be checked for NULL explicitly; no action is + performed on failure other than to return NULL (no warning it printed, no + exception is set, etc). + + For allocating objects, use PyObject_{New, NewVar} instead whenever + possible. The PyObject_{Malloc, Realloc, Free} family is exposed + so that you can exploit Python's small-block allocator for non-object + uses. If you must use these routines to allocate object memory, make sure + the object gets initialized via PyObject_{Init, InitVar} after obtaining + the raw memory. +*/ +PyAPI_FUNC(void *) PyObject_Malloc(size_t size); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 +PyAPI_FUNC(void *) PyObject_Calloc(size_t nelem, size_t elsize); +#endif +PyAPI_FUNC(void *) PyObject_Realloc(void *ptr, size_t new_size); +PyAPI_FUNC(void) PyObject_Free(void *ptr); + + +// Deprecated aliases only kept for backward compatibility. +// PyObject_Del and PyObject_DEL are defined with no parameter to be able to +// use them as function pointers (ex: tp_free = PyObject_Del). +#define PyObject_MALLOC PyObject_Malloc +#define PyObject_REALLOC PyObject_Realloc +#define PyObject_FREE PyObject_Free +#define PyObject_Del PyObject_Free +#define PyObject_DEL PyObject_Free + + +/* + * Generic object allocator interface + * ================================== + */ + +/* Functions */ +PyAPI_FUNC(PyObject *) PyObject_Init(PyObject *, PyTypeObject *); +PyAPI_FUNC(PyVarObject *) PyObject_InitVar(PyVarObject *, + PyTypeObject *, Py_ssize_t); + +#define PyObject_INIT(op, typeobj) \ + PyObject_Init(_PyObject_CAST(op), (typeobj)) +#define PyObject_INIT_VAR(op, typeobj, size) \ + PyObject_InitVar(_PyVarObject_CAST(op), (typeobj), (size)) + + +PyAPI_FUNC(PyObject *) _PyObject_New(PyTypeObject *); +PyAPI_FUNC(PyVarObject *) _PyObject_NewVar(PyTypeObject *, Py_ssize_t); + +#define PyObject_New(type, typeobj) ((type *)_PyObject_New(typeobj)) + +// Alias to PyObject_New(). In Python 3.8, PyObject_NEW() called directly +// PyObject_MALLOC() with _PyObject_SIZE(). +#define PyObject_NEW(type, typeobj) PyObject_New(type, (typeobj)) + +#define PyObject_NewVar(type, typeobj, n) \ + ( (type *) _PyObject_NewVar((typeobj), (n)) ) + +// Alias to PyObject_NewVar(). In Python 3.8, PyObject_NEW_VAR() called +// directly PyObject_MALLOC() with _PyObject_VAR_SIZE(). +#define PyObject_NEW_VAR(type, typeobj, n) PyObject_NewVar(type, (typeobj), (n)) + + +/* + * Garbage Collection Support + * ========================== + */ + +/* C equivalent of gc.collect(). */ +PyAPI_FUNC(Py_ssize_t) PyGC_Collect(void); +/* C API for controlling the state of the garbage collector */ +PyAPI_FUNC(int) PyGC_Enable(void); +PyAPI_FUNC(int) PyGC_Disable(void); +PyAPI_FUNC(int) PyGC_IsEnabled(void); + +/* Test if a type has a GC head */ +#define PyType_IS_GC(t) PyType_HasFeature((t), Py_TPFLAGS_HAVE_GC) + +PyAPI_FUNC(PyVarObject *) _PyObject_GC_Resize(PyVarObject *, Py_ssize_t); +#define PyObject_GC_Resize(type, op, n) \ + ( (type *) _PyObject_GC_Resize(_PyVarObject_CAST(op), (n)) ) + + + +PyAPI_FUNC(PyObject *) _PyObject_GC_New(PyTypeObject *); +PyAPI_FUNC(PyVarObject *) _PyObject_GC_NewVar(PyTypeObject *, Py_ssize_t); + +/* Tell the GC to track this object. + * + * See also private _PyObject_GC_TRACK() macro. */ +PyAPI_FUNC(void) PyObject_GC_Track(void *); + +/* Tell the GC to stop tracking this object. + * + * See also private _PyObject_GC_UNTRACK() macro. 
*/ +PyAPI_FUNC(void) PyObject_GC_UnTrack(void *); + +PyAPI_FUNC(void) PyObject_GC_Del(void *); + +#define PyObject_GC_New(type, typeobj) \ + _Py_CAST(type*, _PyObject_GC_New(typeobj)) +#define PyObject_GC_NewVar(type, typeobj, n) \ + _Py_CAST(type*, _PyObject_GC_NewVar((typeobj), (n))) + +PyAPI_FUNC(int) PyObject_GC_IsTracked(PyObject *); +PyAPI_FUNC(int) PyObject_GC_IsFinalized(PyObject *); + +/* Utility macro to help write tp_traverse functions. + * To use this macro, the tp_traverse function must name its arguments + * "visit" and "arg". This is intended to keep tp_traverse functions + * looking as much alike as possible. + */ +#define Py_VISIT(op) \ + do { \ + if (op) { \ + int vret = visit(_PyObject_CAST(op), arg); \ + if (vret) \ + return vret; \ + } \ + } while (0) + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_OBJIMPL_H +# include "cpython/objimpl.h" +# undef Py_CPYTHON_OBJIMPL_H +#endif + +#ifdef __cplusplus +} +#endif +#endif // !Py_OBJIMPL_H diff --git a/Include/opcode.h b/Include/opcode.h new file mode 100644 index 0000000000000000000000000000000000000000..2619b690019acc370d3d73c7544f3fd1cff99b42 --- /dev/null +++ b/Include/opcode.h @@ -0,0 +1,42 @@ +#ifndef Py_OPCODE_H +#define Py_OPCODE_H +#ifdef __cplusplus +extern "C" { +#endif + +#include "opcode_ids.h" + + +#define NB_ADD 0 +#define NB_AND 1 +#define NB_FLOOR_DIVIDE 2 +#define NB_LSHIFT 3 +#define NB_MATRIX_MULTIPLY 4 +#define NB_MULTIPLY 5 +#define NB_REMAINDER 6 +#define NB_OR 7 +#define NB_POWER 8 +#define NB_RSHIFT 9 +#define NB_SUBTRACT 10 +#define NB_TRUE_DIVIDE 11 +#define NB_XOR 12 +#define NB_INPLACE_ADD 13 +#define NB_INPLACE_AND 14 +#define NB_INPLACE_FLOOR_DIVIDE 15 +#define NB_INPLACE_LSHIFT 16 +#define NB_INPLACE_MATRIX_MULTIPLY 17 +#define NB_INPLACE_MULTIPLY 18 +#define NB_INPLACE_REMAINDER 19 +#define NB_INPLACE_OR 20 +#define NB_INPLACE_POWER 21 +#define NB_INPLACE_RSHIFT 22 +#define NB_INPLACE_SUBTRACT 23 +#define NB_INPLACE_TRUE_DIVIDE 24 +#define NB_INPLACE_XOR 25 + +#define NB_OPARG_LAST 25 + +#ifdef __cplusplus +} +#endif +#endif /* !Py_OPCODE_H */ diff --git a/Include/opcode_ids.h b/Include/opcode_ids.h new file mode 100644 index 0000000000000000000000000000000000000000..647f7c0ecb1ec83a84c4db190338ac3e60f818d1 --- /dev/null +++ b/Include/opcode_ids.h @@ -0,0 +1,244 @@ +// This file is generated by Tools/cases_generator/opcode_id_generator.py +// from: +// Python/bytecodes.c +// Do not edit! 
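The Py_VISIT helper above is easiest to read next to the tp_traverse slot it
is written for. A hedged sketch, where "PairObject" is a hypothetical
GC-tracked container (the parameters must be named "visit" and "arg", as the
comment above requires):

    typedef struct {
        PyObject_HEAD
        PyObject *first;    /* may be NULL */
        PyObject *second;   /* may be NULL */
    } PairObject;

    static int
    pair_traverse(PairObject *self, visitproc visit, void *arg)
    {
        /* Py_VISIT skips NULL fields and returns early if visit() fails. */
        Py_VISIT(self->first);
        Py_VISIT(self->second);
        return 0;
    }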
+ +#ifndef Py_OPCODE_IDS_H +#define Py_OPCODE_IDS_H +#ifdef __cplusplus +extern "C" { +#endif + +/* Instruction opcodes for compiled code */ +#define CACHE 0 +#define BEFORE_ASYNC_WITH 1 +#define BEFORE_WITH 2 +#define BINARY_OP_INPLACE_ADD_UNICODE 3 +#define BINARY_SLICE 4 +#define BINARY_SUBSCR 5 +#define CHECK_EG_MATCH 6 +#define CHECK_EXC_MATCH 7 +#define CLEANUP_THROW 8 +#define DELETE_SUBSCR 9 +#define END_ASYNC_FOR 10 +#define END_FOR 11 +#define END_SEND 12 +#define EXIT_INIT_CHECK 13 +#define FORMAT_SIMPLE 14 +#define FORMAT_WITH_SPEC 15 +#define GET_AITER 16 +#define RESERVED 17 +#define GET_ANEXT 18 +#define GET_ITER 19 +#define GET_LEN 20 +#define GET_YIELD_FROM_ITER 21 +#define INTERPRETER_EXIT 22 +#define LOAD_ASSERTION_ERROR 23 +#define LOAD_BUILD_CLASS 24 +#define LOAD_LOCALS 25 +#define MAKE_FUNCTION 26 +#define MATCH_KEYS 27 +#define MATCH_MAPPING 28 +#define MATCH_SEQUENCE 29 +#define NOP 30 +#define POP_EXCEPT 31 +#define POP_TOP 32 +#define PUSH_EXC_INFO 33 +#define PUSH_NULL 34 +#define RETURN_GENERATOR 35 +#define RETURN_VALUE 36 +#define SETUP_ANNOTATIONS 37 +#define STORE_SLICE 38 +#define STORE_SUBSCR 39 +#define TO_BOOL 40 +#define UNARY_INVERT 41 +#define UNARY_NEGATIVE 42 +#define UNARY_NOT 43 +#define WITH_EXCEPT_START 44 +#define BINARY_OP 45 +#define BUILD_CONST_KEY_MAP 46 +#define BUILD_LIST 47 +#define BUILD_MAP 48 +#define BUILD_SET 49 +#define BUILD_SLICE 50 +#define BUILD_STRING 51 +#define BUILD_TUPLE 52 +#define CALL 53 +#define CALL_FUNCTION_EX 54 +#define CALL_INTRINSIC_1 55 +#define CALL_INTRINSIC_2 56 +#define CALL_KW 57 +#define COMPARE_OP 58 +#define CONTAINS_OP 59 +#define CONVERT_VALUE 60 +#define COPY 61 +#define COPY_FREE_VARS 62 +#define DELETE_ATTR 63 +#define DELETE_DEREF 64 +#define DELETE_FAST 65 +#define DELETE_GLOBAL 66 +#define DELETE_NAME 67 +#define DICT_MERGE 68 +#define DICT_UPDATE 69 +#define ENTER_EXECUTOR 70 +#define EXTENDED_ARG 71 +#define FOR_ITER 72 +#define GET_AWAITABLE 73 +#define IMPORT_FROM 74 +#define IMPORT_NAME 75 +#define IS_OP 76 +#define JUMP_BACKWARD 77 +#define JUMP_BACKWARD_NO_INTERRUPT 78 +#define JUMP_FORWARD 79 +#define LIST_APPEND 80 +#define LIST_EXTEND 81 +#define LOAD_ATTR 82 +#define LOAD_CONST 83 +#define LOAD_DEREF 84 +#define LOAD_FAST 85 +#define LOAD_FAST_AND_CLEAR 86 +#define LOAD_FAST_CHECK 87 +#define LOAD_FAST_LOAD_FAST 88 +#define LOAD_FROM_DICT_OR_DEREF 89 +#define LOAD_FROM_DICT_OR_GLOBALS 90 +#define LOAD_GLOBAL 91 +#define LOAD_NAME 92 +#define LOAD_SUPER_ATTR 93 +#define MAKE_CELL 94 +#define MAP_ADD 95 +#define MATCH_CLASS 96 +#define POP_JUMP_IF_FALSE 97 +#define POP_JUMP_IF_NONE 98 +#define POP_JUMP_IF_NOT_NONE 99 +#define POP_JUMP_IF_TRUE 100 +#define RAISE_VARARGS 101 +#define RERAISE 102 +#define RETURN_CONST 103 +#define SEND 104 +#define SET_ADD 105 +#define SET_FUNCTION_ATTRIBUTE 106 +#define SET_UPDATE 107 +#define STORE_ATTR 108 +#define STORE_DEREF 109 +#define STORE_FAST 110 +#define STORE_FAST_LOAD_FAST 111 +#define STORE_FAST_STORE_FAST 112 +#define STORE_GLOBAL 113 +#define STORE_NAME 114 +#define SWAP 115 +#define UNPACK_EX 116 +#define UNPACK_SEQUENCE 117 +#define YIELD_VALUE 118 +#define RESUME 149 +#define BINARY_OP_ADD_FLOAT 150 +#define BINARY_OP_ADD_INT 151 +#define BINARY_OP_ADD_UNICODE 152 +#define BINARY_OP_MULTIPLY_FLOAT 153 +#define BINARY_OP_MULTIPLY_INT 154 +#define BINARY_OP_SUBTRACT_FLOAT 155 +#define BINARY_OP_SUBTRACT_INT 156 +#define BINARY_SUBSCR_DICT 157 +#define BINARY_SUBSCR_GETITEM 158 +#define BINARY_SUBSCR_LIST_INT 159 +#define 
BINARY_SUBSCR_STR_INT 160 +#define BINARY_SUBSCR_TUPLE_INT 161 +#define CALL_ALLOC_AND_ENTER_INIT 162 +#define CALL_BOUND_METHOD_EXACT_ARGS 163 +#define CALL_BOUND_METHOD_GENERAL 164 +#define CALL_BUILTIN_CLASS 165 +#define CALL_BUILTIN_FAST 166 +#define CALL_BUILTIN_FAST_WITH_KEYWORDS 167 +#define CALL_BUILTIN_O 168 +#define CALL_ISINSTANCE 169 +#define CALL_LEN 170 +#define CALL_LIST_APPEND 171 +#define CALL_METHOD_DESCRIPTOR_FAST 172 +#define CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS 173 +#define CALL_METHOD_DESCRIPTOR_NOARGS 174 +#define CALL_METHOD_DESCRIPTOR_O 175 +#define CALL_NON_PY_GENERAL 176 +#define CALL_PY_EXACT_ARGS 177 +#define CALL_PY_GENERAL 178 +#define CALL_STR_1 179 +#define CALL_TUPLE_1 180 +#define CALL_TYPE_1 181 +#define COMPARE_OP_FLOAT 182 +#define COMPARE_OP_INT 183 +#define COMPARE_OP_STR 184 +#define CONTAINS_OP_DICT 185 +#define CONTAINS_OP_SET 186 +#define FOR_ITER_GEN 187 +#define FOR_ITER_LIST 188 +#define FOR_ITER_RANGE 189 +#define FOR_ITER_TUPLE 190 +#define LOAD_ATTR_CLASS 191 +#define LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN 192 +#define LOAD_ATTR_INSTANCE_VALUE 193 +#define LOAD_ATTR_METHOD_LAZY_DICT 194 +#define LOAD_ATTR_METHOD_NO_DICT 195 +#define LOAD_ATTR_METHOD_WITH_VALUES 196 +#define LOAD_ATTR_MODULE 197 +#define LOAD_ATTR_NONDESCRIPTOR_NO_DICT 198 +#define LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES 199 +#define LOAD_ATTR_PROPERTY 200 +#define LOAD_ATTR_SLOT 201 +#define LOAD_ATTR_WITH_HINT 202 +#define LOAD_GLOBAL_BUILTIN 203 +#define LOAD_GLOBAL_MODULE 204 +#define LOAD_SUPER_ATTR_ATTR 205 +#define LOAD_SUPER_ATTR_METHOD 206 +#define RESUME_CHECK 207 +#define SEND_GEN 208 +#define STORE_ATTR_INSTANCE_VALUE 209 +#define STORE_ATTR_SLOT 210 +#define STORE_ATTR_WITH_HINT 211 +#define STORE_SUBSCR_DICT 212 +#define STORE_SUBSCR_LIST_INT 213 +#define TO_BOOL_ALWAYS_TRUE 214 +#define TO_BOOL_BOOL 215 +#define TO_BOOL_INT 216 +#define TO_BOOL_LIST 217 +#define TO_BOOL_NONE 218 +#define TO_BOOL_STR 219 +#define UNPACK_SEQUENCE_LIST 220 +#define UNPACK_SEQUENCE_TUPLE 221 +#define UNPACK_SEQUENCE_TWO_TUPLE 222 +#define INSTRUMENTED_RESUME 236 +#define INSTRUMENTED_END_FOR 237 +#define INSTRUMENTED_END_SEND 238 +#define INSTRUMENTED_RETURN_VALUE 239 +#define INSTRUMENTED_RETURN_CONST 240 +#define INSTRUMENTED_YIELD_VALUE 241 +#define INSTRUMENTED_LOAD_SUPER_ATTR 242 +#define INSTRUMENTED_FOR_ITER 243 +#define INSTRUMENTED_CALL 244 +#define INSTRUMENTED_CALL_KW 245 +#define INSTRUMENTED_CALL_FUNCTION_EX 246 +#define INSTRUMENTED_INSTRUCTION 247 +#define INSTRUMENTED_JUMP_FORWARD 248 +#define INSTRUMENTED_JUMP_BACKWARD 249 +#define INSTRUMENTED_POP_JUMP_IF_TRUE 250 +#define INSTRUMENTED_POP_JUMP_IF_FALSE 251 +#define INSTRUMENTED_POP_JUMP_IF_NONE 252 +#define INSTRUMENTED_POP_JUMP_IF_NOT_NONE 253 +#define INSTRUMENTED_LINE 254 +#define JUMP 256 +#define JUMP_NO_INTERRUPT 257 +#define LOAD_CLOSURE 258 +#define LOAD_METHOD 259 +#define LOAD_SUPER_METHOD 260 +#define LOAD_ZERO_SUPER_ATTR 261 +#define LOAD_ZERO_SUPER_METHOD 262 +#define POP_BLOCK 263 +#define SETUP_CLEANUP 264 +#define SETUP_FINALLY 265 +#define SETUP_WITH 266 +#define STORE_FAST_MAYBE_NULL 267 + +#define HAVE_ARGUMENT 44 +#define MIN_INSTRUMENTED_OPCODE 236 + +#ifdef __cplusplus +} +#endif +#endif /* !Py_OPCODE_IDS_H */ diff --git a/Include/osdefs.h b/Include/osdefs.h new file mode 100644 index 0000000000000000000000000000000000000000..2599e87a9d7c4b8e599a88d46b264e2be3a1da7d --- /dev/null +++ b/Include/osdefs.h @@ -0,0 +1,57 @@ +// Operating system dependencies. 
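Two thresholds close the opcode table above: HAVE_ARGUMENT and
MIN_INSTRUMENTED_OPCODE. A hedged sketch of how such thresholds are
conventionally tested when scanning bytecode (the helper names are
illustrative, not part of the header):

    #include "opcode_ids.h"

    /* Opcodes at or above HAVE_ARGUMENT use their oparg; those at or above
       MIN_INSTRUMENTED_OPCODE are the instrumented variants used by
       sys.monitoring. */
    static int opcode_uses_arg(int opcode)     { return opcode >= HAVE_ARGUMENT; }
    static int opcode_instrumented(int opcode) { return opcode >= MIN_INSTRUMENTED_OPCODE; }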
+// +// Define constants: +// +// - ALTSEP +// - DELIM +// - MAXPATHLEN +// - SEP + +#ifndef Py_OSDEFS_H +#define Py_OSDEFS_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef MS_WINDOWS +# define SEP L'\\' +# define ALTSEP L'/' +# define MAXPATHLEN 256 +# define DELIM L';' +#endif + +#ifdef __VXWORKS__ +# define DELIM L';' +#endif + +/* Filename separator */ +#ifndef SEP +# define SEP L'/' +#endif + +/* Max pathname length */ +#ifdef __hpux +# include +# include +# ifndef PATH_MAX +# define PATH_MAX MAXPATHLEN +# endif +#endif + +#ifndef MAXPATHLEN +# if defined(PATH_MAX) && PATH_MAX > 1024 +# define MAXPATHLEN PATH_MAX +# else +# define MAXPATHLEN 1024 +# endif +#endif + +/* Search path entry delimiter */ +#ifndef DELIM +# define DELIM L':' +#endif + +#ifdef __cplusplus +} +#endif +#endif // !Py_OSDEFS_H diff --git a/Include/osmodule.h b/Include/osmodule.h new file mode 100644 index 0000000000000000000000000000000000000000..9095c2fdd3d638140db129937d29830286941819 --- /dev/null +++ b/Include/osmodule.h @@ -0,0 +1,17 @@ + +/* os module interface */ + +#ifndef Py_OSMODULE_H +#define Py_OSMODULE_H +#ifdef __cplusplus +extern "C" { +#endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03060000 +PyAPI_FUNC(PyObject *) PyOS_FSPath(PyObject *path); +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_OSMODULE_H */ diff --git a/Include/patchlevel.h b/Include/patchlevel.h new file mode 100644 index 0000000000000000000000000000000000000000..36b5ac25c63d38eea942737010d4ee7ef5dc9826 --- /dev/null +++ b/Include/patchlevel.h @@ -0,0 +1,35 @@ + +/* Python version identification scheme. + + When the major or minor version changes, the VERSION variable in + configure.ac must also be changed. + + There is also (independent) API version information in modsupport.h. +*/ + +/* Values for PY_RELEASE_LEVEL */ +#define PY_RELEASE_LEVEL_ALPHA 0xA +#define PY_RELEASE_LEVEL_BETA 0xB +#define PY_RELEASE_LEVEL_GAMMA 0xC /* For release candidates */ +#define PY_RELEASE_LEVEL_FINAL 0xF /* Serial should be 0 here */ + /* Higher for patch releases */ + +/* Version parsed out into numeric values */ +/*--start constants--*/ +#define PY_MAJOR_VERSION 3 +#define PY_MINOR_VERSION 13 +#define PY_MICRO_VERSION 7 +#define PY_RELEASE_LEVEL PY_RELEASE_LEVEL_FINAL +#define PY_RELEASE_SERIAL 0 + +/* Version as a string */ +#define PY_VERSION "3.13.7" +/*--end constants--*/ + +/* Version as a single 4-byte hex number, e.g. 0x010502B2 == 1.5.2b2. + Use this for numeric comparisons, e.g. #if PY_VERSION_HEX >= ... */ +#define PY_VERSION_HEX ((PY_MAJOR_VERSION << 24) | \ + (PY_MINOR_VERSION << 16) | \ + (PY_MICRO_VERSION << 8) | \ + (PY_RELEASE_LEVEL << 4) | \ + (PY_RELEASE_SERIAL << 0)) diff --git a/Include/py_curses.h b/Include/py_curses.h new file mode 100644 index 0000000000000000000000000000000000000000..3e8b16c201f810207c72a259e646e57374038a2f --- /dev/null +++ b/Include/py_curses.h @@ -0,0 +1,117 @@ + +#ifndef Py_CURSES_H +#define Py_CURSES_H + +#ifdef __APPLE__ +/* +** On Mac OS X 10.2 [n]curses.h and stdlib.h use different guards +** against multiple definition of wchar_t. +*/ +#ifdef _BSD_WCHAR_T_DEFINED_ +#define _WCHAR_T +#endif +#endif /* __APPLE__ */ + +/* On FreeBSD, [n]curses.h and stdlib.h/wchar.h use different guards + against multiple definition of wchar_t and wint_t. 
*/ +#if defined(__FreeBSD__) && defined(_XOPEN_SOURCE_EXTENDED) +# ifndef __wchar_t +# define __wchar_t +# endif +# ifndef __wint_t +# define __wint_t +# endif +#endif + +#if defined(WINDOW_HAS_FLAGS) && defined(__APPLE__) +/* gh-109617, gh-115383: we can rely on the default value for NCURSES_OPAQUE on + most platforms, but not on macOS. This is because, starting with Xcode 15, + Apple-provided ncurses.h comes from ncurses 6 (which defaults to opaque + structs) but can still be linked to older versions of ncurses dynamic + libraries which don't provide functions such as is_pad() to deal with opaque + structs. Setting NCURSES_OPAQUE to 0 is harmless in all ncurses releases to + this date (provided that a thread-safe implementation is not required), but + this might change in the future. This fix might become irrelevant once + support for macOS 13 or earlier is dropped. */ +#define NCURSES_OPAQUE 0 +#endif + +#if defined(HAVE_NCURSESW_NCURSES_H) +# include +#elif defined(HAVE_NCURSESW_CURSES_H) +# include +#elif defined(HAVE_NCURSES_NCURSES_H) +# include +#elif defined(HAVE_NCURSES_CURSES_H) +# include +#elif defined(HAVE_NCURSES_H) +# include +#elif defined(HAVE_CURSES_H) +# include +#endif + +#ifdef NCURSES_VERSION +/* configure was checking , but we will + use , which has some or all these features. */ +#if !defined(WINDOW_HAS_FLAGS) && \ + (NCURSES_VERSION_PATCH+0 < 20070303 || !(NCURSES_OPAQUE+0)) +/* the WINDOW flags field was always accessible in ncurses prior to 20070303; + after that, it depends on the value of NCURSES_OPAQUE. */ +#define WINDOW_HAS_FLAGS 1 +#endif +#if !defined(HAVE_CURSES_IS_PAD) && NCURSES_VERSION_PATCH+0 >= 20090906 +#define HAVE_CURSES_IS_PAD 1 +#endif +#ifndef MVWDELCH_IS_EXPRESSION +#define MVWDELCH_IS_EXPRESSION 1 +#endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define PyCurses_API_pointers 4 + +/* Type declarations */ + +typedef struct PyCursesWindowObject { + PyObject_HEAD + WINDOW *win; + char *encoding; + struct PyCursesWindowObject *orig; +} PyCursesWindowObject; + +#define PyCursesWindow_Check(v) Py_IS_TYPE((v), &PyCursesWindow_Type) + +#define PyCurses_CAPSULE_NAME "_curses._C_API" + + +#ifdef CURSES_MODULE +/* This section is used when compiling _cursesmodule.c */ + +#else +/* This section is used in modules that use the _cursesmodule API */ + +static void **PyCurses_API; + +#define PyCursesWindow_Type (*_PyType_CAST(PyCurses_API[0])) +#define PyCursesSetupTermCalled {if (! ((int (*)(void))PyCurses_API[1]) () ) return NULL;} +#define PyCursesInitialised {if (! ((int (*)(void))PyCurses_API[2]) () ) return NULL;} +#define PyCursesInitialisedColor {if (! 
((int (*)(void))PyCurses_API[3]) () ) return NULL;} + +#define import_curses() \ + PyCurses_API = (void **)PyCapsule_Import(PyCurses_CAPSULE_NAME, 1); + +#endif + +/* general error messages */ +static const char catchall_ERR[] = "curses function returned ERR"; +static const char catchall_NULL[] = "curses function returned NULL"; + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(Py_CURSES_H) */ + diff --git a/Include/pyatomic.h b/Include/pyatomic.h new file mode 100644 index 0000000000000000000000000000000000000000..2ce2c81cf5251a96b8fcaf60f0feba5b49018a55 --- /dev/null +++ b/Include/pyatomic.h @@ -0,0 +1,16 @@ +#ifndef Py_ATOMIC_H +#define Py_ATOMIC_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_ATOMIC_H +# include "cpython/pyatomic.h" +# undef Py_CPYTHON_ATOMIC_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_ATOMIC_H */ diff --git a/Include/pybuffer.h b/Include/pybuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..ca1c6058d9052c53588dafddc229a1ef13445ca4 --- /dev/null +++ b/Include/pybuffer.h @@ -0,0 +1,145 @@ +/* Public Py_buffer API */ + +#ifndef Py_BUFFER_H +#define Py_BUFFER_H +#ifdef __cplusplus +extern "C" { +#endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030b0000 + +/* === New Buffer API ============================================ + * Limited API and stable ABI since Python 3.11 + * + * Py_buffer struct layout and size is now part of the stable abi3. The + * struct layout and size must not be changed in any way, as it would + * break the ABI. + * + */ + +typedef struct { + void *buf; + PyObject *obj; /* owned reference */ + Py_ssize_t len; + Py_ssize_t itemsize; /* This is Py_ssize_t so it can be + pointed to by strides in simple case.*/ + int readonly; + int ndim; + char *format; + Py_ssize_t *shape; + Py_ssize_t *strides; + Py_ssize_t *suboffsets; + void *internal; +} Py_buffer; + +typedef int (*getbufferproc)(PyObject *, Py_buffer *, int); +typedef void (*releasebufferproc)(PyObject *, Py_buffer *); + +/* Return 1 if the getbuffer function is available, otherwise return 0. */ +PyAPI_FUNC(int) PyObject_CheckBuffer(PyObject *obj); + +/* This is a C-API version of the getbuffer function call. It checks + to make sure object has the required function pointer and issues the + call. + + Returns -1 and raises an error on failure and returns 0 on success. */ +PyAPI_FUNC(int) PyObject_GetBuffer(PyObject *obj, Py_buffer *view, + int flags); + +/* Get the memory area pointed to by the indices for the buffer given. + Note that view->ndim is the assumed size of indices. */ +PyAPI_FUNC(void *) PyBuffer_GetPointer(const Py_buffer *view, const Py_ssize_t *indices); + +/* Return the implied itemsize of the data-format area from a + struct-style description. */ +PyAPI_FUNC(Py_ssize_t) PyBuffer_SizeFromFormat(const char *format); + +/* Implementation in memoryobject.c */ +PyAPI_FUNC(int) PyBuffer_ToContiguous(void *buf, const Py_buffer *view, + Py_ssize_t len, char order); + +PyAPI_FUNC(int) PyBuffer_FromContiguous(const Py_buffer *view, const void *buf, + Py_ssize_t len, char order); + +/* Copy len bytes of data from the contiguous chunk of memory + pointed to by buf into the buffer exported by obj. Return + 0 on success and return -1 and raise a PyBuffer_Error on + error (i.e. the object does not have a buffer interface or + it is not working). 
+ + If fort is 'F', then if the object is multi-dimensional, + then the data will be copied into the array in + Fortran-style (first dimension varies the fastest). If + fort is 'C', then the data will be copied into the array + in C-style (last dimension varies the fastest). If fort + is 'A', then it does not matter and the copy will be made + in whatever way is more efficient. */ +PyAPI_FUNC(int) PyObject_CopyData(PyObject *dest, PyObject *src); + +/* Copy the data from the src buffer to the buffer of destination. */ +PyAPI_FUNC(int) PyBuffer_IsContiguous(const Py_buffer *view, char fort); + +/*Fill the strides array with byte-strides of a contiguous + (Fortran-style if fort is 'F' or C-style otherwise) + array of the given shape with the given number of bytes + per element. */ +PyAPI_FUNC(void) PyBuffer_FillContiguousStrides(int ndims, + Py_ssize_t *shape, + Py_ssize_t *strides, + int itemsize, + char fort); + +/* Fills in a buffer-info structure correctly for an exporter + that can only share a contiguous chunk of memory of + "unsigned bytes" of the given length. + + Returns 0 on success and -1 (with raising an error) on error. */ +PyAPI_FUNC(int) PyBuffer_FillInfo(Py_buffer *view, PyObject *o, void *buf, + Py_ssize_t len, int readonly, + int flags); + +/* Releases a Py_buffer obtained from getbuffer ParseTuple's "s*". */ +PyAPI_FUNC(void) PyBuffer_Release(Py_buffer *view); + +/* Maximum number of dimensions */ +#define PyBUF_MAX_NDIM 64 + +/* Flags for getting buffers. Keep these in sync with inspect.BufferFlags. */ +#define PyBUF_SIMPLE 0 +#define PyBUF_WRITABLE 0x0001 + +#ifndef Py_LIMITED_API +/* we used to include an E, backwards compatible alias */ +#define PyBUF_WRITEABLE PyBUF_WRITABLE +#endif + +#define PyBUF_FORMAT 0x0004 +#define PyBUF_ND 0x0008 +#define PyBUF_STRIDES (0x0010 | PyBUF_ND) +#define PyBUF_C_CONTIGUOUS (0x0020 | PyBUF_STRIDES) +#define PyBUF_F_CONTIGUOUS (0x0040 | PyBUF_STRIDES) +#define PyBUF_ANY_CONTIGUOUS (0x0080 | PyBUF_STRIDES) +#define PyBUF_INDIRECT (0x0100 | PyBUF_STRIDES) + +#define PyBUF_CONTIG (PyBUF_ND | PyBUF_WRITABLE) +#define PyBUF_CONTIG_RO (PyBUF_ND) + +#define PyBUF_STRIDED (PyBUF_STRIDES | PyBUF_WRITABLE) +#define PyBUF_STRIDED_RO (PyBUF_STRIDES) + +#define PyBUF_RECORDS (PyBUF_STRIDES | PyBUF_WRITABLE | PyBUF_FORMAT) +#define PyBUF_RECORDS_RO (PyBUF_STRIDES | PyBUF_FORMAT) + +#define PyBUF_FULL (PyBUF_INDIRECT | PyBUF_WRITABLE | PyBUF_FORMAT) +#define PyBUF_FULL_RO (PyBUF_INDIRECT | PyBUF_FORMAT) + + +#define PyBUF_READ 0x100 +#define PyBUF_WRITE 0x200 + +#endif /* !Py_LIMITED_API || Py_LIMITED_API >= 3.11 */ + +#ifdef __cplusplus +} +#endif +#endif /* Py_BUFFER_H */ diff --git a/Include/pycapsule.h b/Include/pycapsule.h new file mode 100644 index 0000000000000000000000000000000000000000..666b9f8673967051856b1b74f5aca5ce95f620df --- /dev/null +++ b/Include/pycapsule.h @@ -0,0 +1,58 @@ + +/* Capsule objects let you wrap a C "void *" pointer in a Python + object. They're a way of passing data through the Python interpreter + without creating your own custom type. + + Capsules are used for communication between extension modules. + They provide a way for an extension module to export a C interface + to other extension modules, so that extension modules can use the + Python import mechanism to link to one another. + + For more information, please see "c-api/capsule.html" in the + documentation. 
+*/ + +#ifndef Py_CAPSULE_H +#define Py_CAPSULE_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_DATA(PyTypeObject) PyCapsule_Type; + +typedef void (*PyCapsule_Destructor)(PyObject *); + +#define PyCapsule_CheckExact(op) Py_IS_TYPE((op), &PyCapsule_Type) + + +PyAPI_FUNC(PyObject *) PyCapsule_New( + void *pointer, + const char *name, + PyCapsule_Destructor destructor); + +PyAPI_FUNC(void *) PyCapsule_GetPointer(PyObject *capsule, const char *name); + +PyAPI_FUNC(PyCapsule_Destructor) PyCapsule_GetDestructor(PyObject *capsule); + +PyAPI_FUNC(const char *) PyCapsule_GetName(PyObject *capsule); + +PyAPI_FUNC(void *) PyCapsule_GetContext(PyObject *capsule); + +PyAPI_FUNC(int) PyCapsule_IsValid(PyObject *capsule, const char *name); + +PyAPI_FUNC(int) PyCapsule_SetPointer(PyObject *capsule, void *pointer); + +PyAPI_FUNC(int) PyCapsule_SetDestructor(PyObject *capsule, PyCapsule_Destructor destructor); + +PyAPI_FUNC(int) PyCapsule_SetName(PyObject *capsule, const char *name); + +PyAPI_FUNC(int) PyCapsule_SetContext(PyObject *capsule, void *context); + +PyAPI_FUNC(void *) PyCapsule_Import( + const char *name, /* UTF-8 encoded string */ + int no_block); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_CAPSULE_H */ diff --git a/Include/pyconfig.h b/Include/pyconfig.h new file mode 100644 index 0000000000000000000000000000000000000000..424421f6ff16476ebc96e0ec164123ed9913a533 --- /dev/null +++ b/Include/pyconfig.h @@ -0,0 +1,759 @@ +#ifndef Py_CONFIG_H +#define Py_CONFIG_H + +/* pyconfig.h. NOT Generated automatically by configure. + +This is a manually maintained version used for the Watcom, +Borland and Microsoft Visual C++ compilers. It is a +standard part of the Python distribution. + +WINDOWS DEFINES: +The code specific to Windows should be wrapped around one of +the following #defines + +MS_WIN64 - Code specific to the MS Win64 API +MS_WIN32 - Code specific to the MS Win32 (and Win64) API (obsolete, this covers all supported APIs) +MS_WINDOWS - Code specific to Windows, but all versions. +Py_ENABLE_SHARED - Code if the Python core is built as a DLL. + +Also note that neither "_M_IX86" or "_MSC_VER" should be used for +any purpose other than "Windows Intel x86 specific" and "Microsoft +compiler specific". Therefore, these should be very rare. + + +NOTE: The following symbols are deprecated: +NT, USE_DL_EXPORT, USE_DL_IMPORT, DL_EXPORT, DL_IMPORT +MS_CORE_DLL. + +WIN32 is still required for the locale module. + +*/ + +/* Deprecated USE_DL_EXPORT macro - please use Py_BUILD_CORE */ +#ifdef USE_DL_EXPORT +# define Py_BUILD_CORE +#endif /* USE_DL_EXPORT */ + +/* Visual Studio 2005 introduces deprecation warnings for + "insecure" and POSIX functions. The insecure functions should + be replaced by *_s versions (according to Microsoft); the + POSIX functions by _* versions (which, according to Microsoft, + would be ISO C conforming). Neither renaming is feasible, so + we just silence the warnings. */ + +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE 1 +#endif +#ifndef _CRT_NONSTDC_NO_DEPRECATE +#define _CRT_NONSTDC_NO_DEPRECATE 1 +#endif + +#define HAVE_IO_H +#define HAVE_SYS_UTIME_H +#define HAVE_TEMPNAM +#define HAVE_TMPFILE +#define HAVE_TMPNAM +#define HAVE_CLOCK +#define HAVE_STRERROR + +#include + +#define HAVE_STRFTIME +#define DONT_HAVE_SIG_ALARM +#define DONT_HAVE_SIG_PAUSE +#define LONG_BIT 32 +#define WORD_BIT 32 + +#define MS_WIN32 /* only support win32 and greater. 
*/ +#define MS_WINDOWS +#define NT_THREADS +#define WITH_THREAD +#ifndef NETSCAPE_PI +#define USE_SOCKET +#endif + +#if defined(Py_BUILD_CORE) || defined(Py_BUILD_CORE_BUILTIN) || defined(Py_BUILD_CORE_MODULE) +#include + +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define MS_WINDOWS_DESKTOP +#endif +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) +#define MS_WINDOWS_APP +#endif +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_SYSTEM) +#define MS_WINDOWS_SYSTEM +#endif +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_GAMES) +#define MS_WINDOWS_GAMES +#endif + +/* Define to 1 if you support windows console io */ +#if defined(MS_WINDOWS_DESKTOP) || defined(MS_WINDOWS_APP) || defined(MS_WINDOWS_SYSTEM) +#define HAVE_WINDOWS_CONSOLE_IO 1 +#endif +#endif /* Py_BUILD_CORE || Py_BUILD_CORE_BUILTIN || Py_BUILD_CORE_MODULE */ + +/* Define to 1 if you want to disable the GIL */ +/* Uncomment the definition for free-threaded builds, or define it manually + * when compiling extension modules. Note that we test with #ifdef, so + * defining as 0 will still disable the GIL. */ +#ifndef Py_GIL_DISABLED +/* #define Py_GIL_DISABLED 1 */ +#endif + +/* Compiler specific defines */ + +/* ------------------------------------------------------------------------*/ +/* Microsoft C defines _MSC_VER, as does clang-cl.exe */ +#ifdef _MSC_VER + +/* We want COMPILER to expand to a string containing _MSC_VER's *value*. + * This is horridly tricky, because the stringization operator only works + * on macro arguments, and doesn't evaluate macros passed *as* arguments. + */ +#define _Py_PASTE_VERSION(SUFFIX) \ + ("[MSC v." _Py_STRINGIZE(_MSC_VER) " " SUFFIX "]") +/* e.g., this produces, after compile-time string catenation, + * ("[MSC v.1900 64 bit (Intel)]") + * + * _Py_STRINGIZE(_MSC_VER) expands to + * _Py_STRINGIZE1(_MSC_VER) and this second macro call is scanned + * again for macros and so further expands to + * _Py_STRINGIZE1(1900) which then expands to + * "1900" + */ +#define _Py_STRINGIZE(X) _Py_STRINGIZE1(X) +#define _Py_STRINGIZE1(X) #X + +/* MSVC defines _WINxx to differentiate the windows platform types + + Note that for compatibility reasons _WIN32 is defined on Win32 + *and* on Win64. For the same reasons, in Python, MS_WIN32 is + defined on Win32 *and* Win64. Win32 only code must therefore be + guarded as follows: + #if defined(MS_WIN32) && !defined(MS_WIN64) +*/ +#ifdef _WIN64 +#define MS_WIN64 +#endif + +/* set the COMPILER and support tier + * + * win_amd64 MSVC (x86_64-pc-windows-msvc): 1 + * win32 MSVC (i686-pc-windows-msvc): 1 + * win_arm64 MSVC (aarch64-pc-windows-msvc): 3 + * other archs and ICC: 0 + */ +#ifdef MS_WIN64 +#if defined(_M_X64) || defined(_M_AMD64) +#if defined(__clang__) +#define COMPILER ("[Clang " __clang_version__ "] 64 bit (AMD64) with MSC v." _Py_STRINGIZE(_MSC_VER) " CRT]") +#define PY_SUPPORT_TIER 0 +#elif defined(__INTEL_COMPILER) +#define COMPILER ("[ICC v." _Py_STRINGIZE(__INTEL_COMPILER) " 64 bit (amd64) with MSC v." 
_Py_STRINGIZE(_MSC_VER) " CRT]") +#define PY_SUPPORT_TIER 0 +#else +#define COMPILER _Py_PASTE_VERSION("64 bit (AMD64)") +#define PY_SUPPORT_TIER 1 +#endif /* __clang__ */ +#define PYD_PLATFORM_TAG "win_amd64" +#elif defined(_M_ARM64) +#define COMPILER _Py_PASTE_VERSION("64 bit (ARM64)") +#define PY_SUPPORT_TIER 3 +#define PYD_PLATFORM_TAG "win_arm64" +#else +#define COMPILER _Py_PASTE_VERSION("64 bit (Unknown)") +#define PY_SUPPORT_TIER 0 +#endif +#endif /* MS_WIN64 */ + +/* set the version macros for the windows headers */ +/* Python 3.12+ requires Windows 8.1 or greater */ +#define Py_WINVER 0x0603 /* _WIN32_WINNT_WINBLUE (8.1) */ +#define Py_NTDDI NTDDI_WINBLUE + +/* We only set these values when building Python - we don't want to force + these values on extensions, as that will affect the prototypes and + structures exposed in the Windows headers. Even when building Python, we + allow a single source file to override this - they may need access to + structures etc so it can optionally use new Windows features if it + determines at runtime they are available. +*/ +#if defined(Py_BUILD_CORE) || defined(Py_BUILD_CORE_BUILTIN) || defined(Py_BUILD_CORE_MODULE) +#ifndef NTDDI_VERSION +#define NTDDI_VERSION Py_NTDDI +#endif +#ifndef WINVER +#define WINVER Py_WINVER +#endif +#ifndef _WIN32_WINNT +#define _WIN32_WINNT Py_WINVER +#endif +#endif + +/* _W64 is not defined for VC6 or eVC4 */ +#ifndef _W64 +#define _W64 +#endif + +/* Define like size_t, omitting the "unsigned" */ +#ifdef MS_WIN64 +typedef __int64 Py_ssize_t; +# define PY_SSIZE_T_MAX LLONG_MAX +#else +typedef _W64 int Py_ssize_t; +# define PY_SSIZE_T_MAX INT_MAX +#endif +#define HAVE_PY_SSIZE_T 1 + +#if defined(MS_WIN32) && !defined(MS_WIN64) +#if defined(_M_IX86) +#if defined(__clang__) +#define COMPILER ("[Clang " __clang_version__ "] 32 bit (Intel) with MSC v." _Py_STRINGIZE(_MSC_VER) " CRT]") +#define PY_SUPPORT_TIER 0 +#elif defined(__INTEL_COMPILER) +#define COMPILER ("[ICC v." _Py_STRINGIZE(__INTEL_COMPILER) " 32 bit (Intel) with MSC v." _Py_STRINGIZE(_MSC_VER) " CRT]") +#define PY_SUPPORT_TIER 0 +#else +#define COMPILER _Py_PASTE_VERSION("32 bit (Intel)") +#define PY_SUPPORT_TIER 1 +#endif /* __clang__ */ +#define PYD_PLATFORM_TAG "win32" +#elif defined(_M_ARM) +#define COMPILER _Py_PASTE_VERSION("32 bit (ARM)") +#define PYD_PLATFORM_TAG "win_arm32" +#define PY_SUPPORT_TIER 0 +#else +#define COMPILER _Py_PASTE_VERSION("32 bit (Unknown)") +#define PY_SUPPORT_TIER 0 +#endif +#endif /* MS_WIN32 && !MS_WIN64 */ + +typedef int pid_t; + +/* define some ANSI types that are not defined in earlier Win headers */ +#if _MSC_VER >= 1200 +/* This file only exists in VC 6.0 or higher */ +#include +#endif + +#endif /* _MSC_VER */ + +/* ------------------------------------------------------------------------*/ +/* mingw and mingw-w64 define __MINGW32__ */ +#ifdef __MINGW32__ + +#ifdef _WIN64 +#define MS_WIN64 +#endif + +#endif /* __MINGW32__*/ + +/* ------------------------------------------------------------------------*/ +/* egcs/gnu-win32 defines __GNUC__ and _WIN32 */ +#if defined(__GNUC__) && defined(_WIN32) +/* XXX These defines are likely incomplete, but should be easy to fix. + They should be complete enough to build extension modules. */ +/* Suggested by Rene Liebscher to avoid a GCC 2.91.* + bug that requires structure imports. More recent versions of the + compiler don't exhibit this bug. +*/ +#if (__GNUC__==2) && (__GNUC_MINOR__<=91) +#warning "Please use an up-to-date version of gcc! 
(>2.91 recommended)" +#endif + +#define COMPILER "[gcc]" +#define PY_LONG_LONG long long +#define PY_LLONG_MIN LLONG_MIN +#define PY_LLONG_MAX LLONG_MAX +#define PY_ULLONG_MAX ULLONG_MAX +#endif /* GNUC */ + +/* ------------------------------------------------------------------------*/ +/* lcc-win32 defines __LCC__ */ +#if defined(__LCC__) +/* XXX These defines are likely incomplete, but should be easy to fix. + They should be complete enough to build extension modules. */ + +#define COMPILER "[lcc-win32]" +typedef int pid_t; +/* __declspec() is supported here too - do nothing to get the defaults */ + +#endif /* LCC */ + +/* ------------------------------------------------------------------------*/ +/* End of compilers - finish up */ + +#ifndef NO_STDIO_H +# include +#endif + +/* 64 bit ints are usually spelt __int64 unless compiler has overridden */ +#ifndef PY_LONG_LONG +# define PY_LONG_LONG __int64 +# define PY_LLONG_MAX _I64_MAX +# define PY_LLONG_MIN _I64_MIN +# define PY_ULLONG_MAX _UI64_MAX +#endif + +/* For Windows the Python core is in a DLL by default. Test +Py_NO_ENABLE_SHARED to find out. Also support MS_NO_COREDLL for b/w compat */ +#if !defined(MS_NO_COREDLL) && !defined(Py_NO_ENABLE_SHARED) +# define Py_ENABLE_SHARED 1 /* standard symbol for shared library */ +# define MS_COREDLL /* deprecated old symbol */ +#endif /* !MS_NO_COREDLL && ... */ + +/* All windows compilers that use this header support __declspec */ +#define HAVE_DECLSPEC_DLL + +/* For an MSVC DLL, we can nominate the .lib files used by extensions */ +#ifdef MS_COREDLL +# if !defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_BUILTIN) + /* not building the core - must be an ext */ +# if defined(_MSC_VER) + /* So MSVC users need not specify the .lib + file in their Makefile */ +# if defined(Py_GIL_DISABLED) +# if defined(_DEBUG) +# pragma comment(lib,"python313t_d.lib") +# elif defined(Py_LIMITED_API) +# pragma comment(lib,"python3t.lib") +# else +# pragma comment(lib,"python313t.lib") +# endif /* _DEBUG */ +# else /* Py_GIL_DISABLED */ +# if defined(_DEBUG) +# pragma comment(lib,"python313_d.lib") +# elif defined(Py_LIMITED_API) +# pragma comment(lib,"python3.lib") +# else +# pragma comment(lib,"python313.lib") +# endif /* _DEBUG */ +# endif /* Py_GIL_DISABLED */ +# endif /* _MSC_VER */ +# endif /* Py_BUILD_CORE */ +#endif /* MS_COREDLL */ + +#ifdef MS_WIN64 +/* maintain "win32" sys.platform for backward compatibility of Python code, + the Win64 API should be close enough to the Win32 API to make this + preferable */ +# define PLATFORM "win32" +# define SIZEOF_VOID_P 8 +# define SIZEOF_TIME_T 8 +# define SIZEOF_OFF_T 4 +# define SIZEOF_FPOS_T 8 +# define SIZEOF_HKEY 8 +# define SIZEOF_SIZE_T 8 +# define ALIGNOF_SIZE_T 8 +# define ALIGNOF_MAX_ALIGN_T 8 +/* configure.ac defines HAVE_LARGEFILE_SUPPORT iff + sizeof(off_t) > sizeof(long), and sizeof(long long) >= sizeof(off_t). + On Win64 the second condition is not true, but if fpos_t replaces off_t + then this is true. The uses of HAVE_LARGEFILE_SUPPORT imply that Win64 + should define this. 
*/ +# define HAVE_LARGEFILE_SUPPORT +#elif defined(MS_WIN32) +# define PLATFORM "win32" +# define HAVE_LARGEFILE_SUPPORT +# define SIZEOF_VOID_P 4 +# define SIZEOF_OFF_T 4 +# define SIZEOF_FPOS_T 8 +# define SIZEOF_HKEY 4 +# define SIZEOF_SIZE_T 4 +# define ALIGNOF_SIZE_T 4 + /* MS VS2005 changes time_t to a 64-bit type on all platforms */ +# if defined(_MSC_VER) && _MSC_VER >= 1400 +# define SIZEOF_TIME_T 8 +# else +# define SIZEOF_TIME_T 4 +# endif +# define ALIGNOF_MAX_ALIGN_T 8 +#endif + +#ifdef _DEBUG +# define Py_DEBUG +#endif + + +#ifdef MS_WIN32 + +#define SIZEOF_SHORT 2 +#define SIZEOF_INT 4 +#define SIZEOF_LONG 4 +#define ALIGNOF_LONG 4 +#define SIZEOF_LONG_LONG 8 +#define SIZEOF_DOUBLE 8 +#define SIZEOF_FLOAT 4 + +/* VC 7.1 has them and VC 6.0 does not. VC 6.0 has a version number of 1200. + Microsoft eMbedded Visual C++ 4.0 has a version number of 1201 and doesn't + define these. + If some compiler does not provide them, modify the #if appropriately. */ +#if defined(_MSC_VER) +#if _MSC_VER > 1300 +#define HAVE_UINTPTR_T 1 +#define HAVE_INTPTR_T 1 +#else +/* VC6, VS 2002 and eVC4 don't support the C99 LL suffix for 64-bit integer literals */ +#define Py_LL(x) x##I64 +#endif /* _MSC_VER > 1300 */ +#endif /* _MSC_VER */ + +#endif + +/* define signed and unsigned exact-width 32-bit and 64-bit types, used in the + implementation of Python integers. */ +#define PY_UINT32_T uint32_t +#define PY_UINT64_T uint64_t +#define PY_INT32_T int32_t +#define PY_INT64_T int64_t + +/* Fairly standard from here! */ + +/* Define if on AIX 3. + System headers sometimes define this. + We just want to avoid a redefinition error message. */ +#ifndef _ALL_SOURCE +/* #undef _ALL_SOURCE */ +#endif + +/* Define to empty if the keyword does not work. */ +/* #define const */ + +/* Define to 1 if you have the header file. */ +#define HAVE_CONIO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DIRECT_H 1 + +/* Define to 1 if you have the declaration of `tzname', and to 0 if you don't. + */ +#define HAVE_DECL_TZNAME 1 + +/* Define if you have dirent.h. */ +/* #define DIRENT 1 */ + +/* Define to the type of elements in the array set by `getgroups'. + Usually this is either `int' or `gid_t'. */ +/* #undef GETGROUPS_T */ + +/* Define to `int' if doesn't define. */ +/* #undef gid_t */ + +/* Define if your struct tm has tm_zone. */ +/* #undef HAVE_TM_ZONE */ + +/* Define if you don't have tm_zone but do have the external array + tzname. */ +#define HAVE_TZNAME + +/* Define to `int' if doesn't define. */ +/* #undef mode_t */ + +/* Define if you don't have dirent.h, but have ndir.h. */ +/* #undef NDIR */ + +/* Define to `long' if doesn't define. */ +/* #undef off_t */ + +/* Define to `int' if doesn't define. */ +/* #undef pid_t */ + +/* Define if the system does not provide POSIX.1 features except + with this defined. */ +/* #undef _POSIX_1_SOURCE */ + +/* Define if you need to in order for stat and other things to work. */ +/* #undef _POSIX_SOURCE */ + +/* Define as the return type of signal handlers (int or void). */ +#define RETSIGTYPE void + +/* Define to `unsigned' if doesn't define. */ +/* #undef size_t */ + +/* Define if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define if you don't have dirent.h, but have sys/dir.h. */ +/* #undef SYSDIR */ + +/* Define if you don't have dirent.h, but have sys/ndir.h. */ +/* #undef SYSNDIR */ + +/* Define if you can safely include both and . */ +/* #undef TIME_WITH_SYS_TIME */ + +/* Define if your declares struct tm. 
*/ +/* #define TM_IN_SYS_TIME 1 */ + +/* Define to `int' if doesn't define. */ +/* #undef uid_t */ + +/* Define if the closedir function returns void instead of int. */ +/* #undef VOID_CLOSEDIR */ + +/* Define if getpgrp() must be called as getpgrp(0) + and (consequently) setpgrp() as setpgrp(0, 0). */ +/* #undef GETPGRP_HAVE_ARGS */ + +/* Define this if your time.h defines altzone */ +/* #define HAVE_ALTZONE */ + +/* Define if you have the putenv function. */ +#define HAVE_PUTENV + +/* Define if your compiler supports function prototypes */ +#define HAVE_PROTOTYPES + +/* Define if you can safely include both and + (which you can't on SCO ODT 3.0). */ +/* #undef SYS_SELECT_WITH_SYS_TIME */ + +/* Define if you want build the _decimal module using a coroutine-local rather + than a thread-local context */ +#define WITH_DECIMAL_CONTEXTVAR 1 + +/* Define if you want documentation strings in extension modules */ +#define WITH_DOC_STRINGS 1 + +/* Define if you want to compile in rudimentary thread support */ +/* #undef WITH_THREAD */ + +/* Define if you want to use the GNU readline library */ +/* #define WITH_READLINE 1 */ + +/* Use Python's own small-block memory-allocator. */ +#define WITH_PYMALLOC 1 + +/* Define if you want to compile in mimalloc memory allocator. */ +#define WITH_MIMALLOC 1 + +/* Define if you want to compile in object freelists optimization */ +#define WITH_FREELISTS 1 + +/* Define if you have clock. */ +/* #define HAVE_CLOCK */ + +/* Define when any dynamic module loading is enabled */ +#define HAVE_DYNAMIC_LOADING + +/* Define if you have ftime. */ +#define HAVE_FTIME + +/* Define if you have getpeername. */ +#define HAVE_GETPEERNAME + +/* Define if you have getpgrp. */ +/* #undef HAVE_GETPGRP */ + +/* Define if you have getpid. */ +#define HAVE_GETPID + +/* Define if you have gettimeofday. */ +/* #undef HAVE_GETTIMEOFDAY */ + +/* Define if you have getwd. */ +/* #undef HAVE_GETWD */ + +/* Define if you have lstat. */ +/* #undef HAVE_LSTAT */ + +/* Define if you have the mktime function. */ +#define HAVE_MKTIME + +/* Define if you have nice. */ +/* #undef HAVE_NICE */ + +/* Define if you have readlink. */ +/* #undef HAVE_READLINK */ + +/* Define if you have setpgid. */ +/* #undef HAVE_SETPGID */ + +/* Define if you have setpgrp. */ +/* #undef HAVE_SETPGRP */ + +/* Define if you have setsid. */ +/* #undef HAVE_SETSID */ + +/* Define if you have setvbuf. */ +#define HAVE_SETVBUF + +/* Define if you have siginterrupt. */ +/* #undef HAVE_SIGINTERRUPT */ + +/* Define to 1 if you have the `shutdown' function. */ +#define HAVE_SHUTDOWN 1 + +/* Define if you have symlink. */ +/* #undef HAVE_SYMLINK */ + +/* Define if you have tcgetpgrp. */ +/* #undef HAVE_TCGETPGRP */ + +/* Define if you have tcsetpgrp. */ +/* #undef HAVE_TCSETPGRP */ + +/* Define if you have times. */ +/* #undef HAVE_TIMES */ + +/* Define to 1 if you have the `umask' function. */ +#define HAVE_UMASK 1 + +/* Define if you have uname. */ +/* #undef HAVE_UNAME */ + +/* Define if you have waitpid. */ +/* #undef HAVE_WAITPID */ + +/* Define to 1 if you have the `wcsftime' function. */ +#if defined(_MSC_VER) && _MSC_VER >= 1310 +#define HAVE_WCSFTIME 1 +#endif + +/* Define to 1 if you have the `wcscoll' function. */ +#define HAVE_WCSCOLL 1 + +/* Define to 1 if you have the `wcsxfrm' function. */ +#define HAVE_WCSXFRM 1 + +/* Define if the zlib library has inflateCopy */ +#define HAVE_ZLIB_COPY 1 + +/* Define if you have the header file. */ +/* #undef HAVE_DLFCN_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_ERRNO_H 1 + +/* Define if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PROCESS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SIGNAL_H 1 + +/* Define if you have the header file. */ +#define HAVE_STDDEF_H 1 + +/* Define if you have the header file. */ +/* #undef HAVE_SYS_AUDIOIO_H */ + +/* Define if you have the header file. */ +/* #define HAVE_SYS_PARAM_H 1 */ + +/* Define if you have the header file. */ +/* #define HAVE_SYS_SELECT_H 1 */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define if you have the header file. */ +/* #define HAVE_SYS_TIME_H 1 */ + +/* Define if you have the header file. */ +/* #define HAVE_SYS_TIMES_H 1 */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define if you have the header file. */ +/* #define HAVE_SYS_UN_H 1 */ + +/* Define if you have the header file. */ +/* #define HAVE_SYS_UTIME_H 1 */ + +/* Define if you have the header file. */ +/* #define HAVE_SYS_UTSNAME_H 1 */ + +/* Define if you have the header file. */ +/* #define HAVE_UNISTD_H 1 */ + +/* Define if you have the header file. */ +/* #define HAVE_UTIME_H 1 */ + +/* Define if the compiler provides a wchar.h header file. */ +#define HAVE_WCHAR_H 1 + +/* The size of `wchar_t', as computed by sizeof. */ +#define SIZEOF_WCHAR_T 2 + +/* The size of `_Bool', as computed by sizeof. */ +#define SIZEOF__BOOL 1 + +/* The size of `pid_t', as computed by sizeof. */ +#define SIZEOF_PID_T SIZEOF_INT + +/* Define if you have the dl library (-ldl). */ +/* #undef HAVE_LIBDL */ + +/* Define if you have the mpc library (-lmpc). */ +/* #undef HAVE_LIBMPC */ + +/* Define if you have the seq library (-lseq). */ +/* #undef HAVE_LIBSEQ */ + +/* Define if you have the socket library (-lsocket). */ +#define HAVE_LIBSOCKET 1 + +/* Define if you have the sun library (-lsun). */ +/* #undef HAVE_LIBSUN */ + +/* Define if you have the termcap library (-ltermcap). */ +/* #undef HAVE_LIBTERMCAP */ + +/* Define if you have the termlib library (-ltermlib). */ +/* #undef HAVE_LIBTERMLIB */ + +/* Define if you have the thread library (-lthread). */ +/* #undef HAVE_LIBTHREAD */ + +/* WinSock does not use a bitmask in select, and uses + socket handles greater than FD_SETSIZE */ +#define Py_SOCKET_FD_CAN_BE_GE_FD_SETSIZE + +/* Define if C doubles are 64-bit IEEE 754 binary format, stored with the + least significant byte first */ +#define DOUBLE_IS_LITTLE_ENDIAN_IEEE754 1 + +/* Define to 1 if you have the `erf' function. */ +#define HAVE_ERF 1 + +/* Define to 1 if you have the `erfc' function. */ +#define HAVE_ERFC 1 + +// netdb.h functions (provided by winsock.h) +#define HAVE_GETHOSTNAME 1 +#define HAVE_GETHOSTBYADDR 1 +#define HAVE_GETHOSTBYNAME 1 +#define HAVE_GETPROTOBYNAME 1 +#define HAVE_GETSERVBYNAME 1 +#define HAVE_GETSERVBYPORT 1 +// sys/socket.h functions (provided by winsock.h) +#define HAVE_INET_PTON 1 +#define HAVE_INET_NTOA 1 +#define HAVE_ACCEPT 1 +#define HAVE_BIND 1 +#define HAVE_CONNECT 1 +#define HAVE_GETSOCKNAME 1 +#define HAVE_LISTEN 1 +#define HAVE_RECVFROM 1 +#define HAVE_SENDTO 1 +#define HAVE_SETSOCKOPT 1 +#define HAVE_SOCKET 1 + +/* Define to 1 if you have the `dup' function. 
*/ +#define HAVE_DUP 1 + +/* framework name */ +#define _PYTHONFRAMEWORK "" + +/* Define if libssl has X509_VERIFY_PARAM_set1_host and related function */ +#define HAVE_X509_VERIFY_PARAM_SET1_HOST 1 + +#endif /* !Py_CONFIG_H */ diff --git a/Include/pydtrace.h b/Include/pydtrace.h new file mode 100644 index 0000000000000000000000000000000000000000..e197d36694537b188e8aaf8d8e89f770a3de96a1 --- /dev/null +++ b/Include/pydtrace.h @@ -0,0 +1,59 @@ +/* Static DTrace probes interface */ + +#ifndef Py_DTRACE_H +#define Py_DTRACE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef WITH_DTRACE + +#include "pydtrace_probes.h" + +/* pydtrace_probes.h, on systems with DTrace, is auto-generated to include + `PyDTrace_{PROBE}` and `PyDTrace_{PROBE}_ENABLED()` macros for every probe + defined in pydtrace.d. + + Calling these functions must be guarded by a `PyDTrace_{PROBE}_ENABLED()` + check to minimize performance impact when probing is off. For example: + + if (PyDTrace_FUNCTION_ENTRY_ENABLED()) + PyDTrace_FUNCTION_ENTRY(f); +*/ + +#else + +/* Without DTrace, compile to nothing. */ + +static inline void PyDTrace_LINE(const char *arg0, const char *arg1, int arg2) {} +static inline void PyDTrace_FUNCTION_ENTRY(const char *arg0, const char *arg1, int arg2) {} +static inline void PyDTrace_FUNCTION_RETURN(const char *arg0, const char *arg1, int arg2) {} +static inline void PyDTrace_GC_START(int arg0) {} +static inline void PyDTrace_GC_DONE(Py_ssize_t arg0) {} +static inline void PyDTrace_INSTANCE_NEW_START(int arg0) {} +static inline void PyDTrace_INSTANCE_NEW_DONE(int arg0) {} +static inline void PyDTrace_INSTANCE_DELETE_START(int arg0) {} +static inline void PyDTrace_INSTANCE_DELETE_DONE(int arg0) {} +static inline void PyDTrace_IMPORT_FIND_LOAD_START(const char *arg0) {} +static inline void PyDTrace_IMPORT_FIND_LOAD_DONE(const char *arg0, int arg1) {} +static inline void PyDTrace_AUDIT(const char *arg0, void *arg1) {} + +static inline int PyDTrace_LINE_ENABLED(void) { return 0; } +static inline int PyDTrace_FUNCTION_ENTRY_ENABLED(void) { return 0; } +static inline int PyDTrace_FUNCTION_RETURN_ENABLED(void) { return 0; } +static inline int PyDTrace_GC_START_ENABLED(void) { return 0; } +static inline int PyDTrace_GC_DONE_ENABLED(void) { return 0; } +static inline int PyDTrace_INSTANCE_NEW_START_ENABLED(void) { return 0; } +static inline int PyDTrace_INSTANCE_NEW_DONE_ENABLED(void) { return 0; } +static inline int PyDTrace_INSTANCE_DELETE_START_ENABLED(void) { return 0; } +static inline int PyDTrace_INSTANCE_DELETE_DONE_ENABLED(void) { return 0; } +static inline int PyDTrace_IMPORT_FIND_LOAD_START_ENABLED(void) { return 0; } +static inline int PyDTrace_IMPORT_FIND_LOAD_DONE_ENABLED(void) { return 0; } +static inline int PyDTrace_AUDIT_ENABLED(void) { return 0; } + +#endif /* !WITH_DTRACE */ + +#ifdef __cplusplus +} +#endif +#endif /* !Py_DTRACE_H */ diff --git a/Include/pyerrors.h b/Include/pyerrors.h new file mode 100644 index 0000000000000000000000000000000000000000..5d0028c116e2d862948042746734e252e4aed0f6 --- /dev/null +++ b/Include/pyerrors.h @@ -0,0 +1,335 @@ +// Error handling definitions + +#ifndef Py_ERRORS_H +#define Py_ERRORS_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_FUNC(void) PyErr_SetNone(PyObject *); +PyAPI_FUNC(void) PyErr_SetObject(PyObject *, PyObject *); +PyAPI_FUNC(void) PyErr_SetString( + PyObject *exception, + const char *string /* decoded from utf-8 */ + ); +PyAPI_FUNC(PyObject *) PyErr_Occurred(void); +PyAPI_FUNC(void) PyErr_Clear(void); +PyAPI_FUNC(void) 
PyErr_Fetch(PyObject **, PyObject **, PyObject **); +PyAPI_FUNC(void) PyErr_Restore(PyObject *, PyObject *, PyObject *); +PyAPI_FUNC(PyObject *) PyErr_GetRaisedException(void); +PyAPI_FUNC(void) PyErr_SetRaisedException(PyObject *); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030b0000 +PyAPI_FUNC(PyObject*) PyErr_GetHandledException(void); +PyAPI_FUNC(void) PyErr_SetHandledException(PyObject *); +#endif +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(void) PyErr_GetExcInfo(PyObject **, PyObject **, PyObject **); +PyAPI_FUNC(void) PyErr_SetExcInfo(PyObject *, PyObject *, PyObject *); +#endif + +/* Defined in Python/pylifecycle.c + + The Py_FatalError() function is replaced with a macro which logs + automatically the name of the current function, unless the Py_LIMITED_API + macro is defined. */ +PyAPI_FUNC(void) _Py_NO_RETURN Py_FatalError(const char *message); + +/* Error testing and normalization */ +PyAPI_FUNC(int) PyErr_GivenExceptionMatches(PyObject *, PyObject *); +PyAPI_FUNC(int) PyErr_ExceptionMatches(PyObject *); +PyAPI_FUNC(void) PyErr_NormalizeException(PyObject**, PyObject**, PyObject**); + +/* Traceback manipulation (PEP 3134) */ +PyAPI_FUNC(int) PyException_SetTraceback(PyObject *, PyObject *); +PyAPI_FUNC(PyObject *) PyException_GetTraceback(PyObject *); + +/* Cause manipulation (PEP 3134) */ +PyAPI_FUNC(PyObject *) PyException_GetCause(PyObject *); +PyAPI_FUNC(void) PyException_SetCause(PyObject *, PyObject *); + +/* Context manipulation (PEP 3134) */ +PyAPI_FUNC(PyObject *) PyException_GetContext(PyObject *); +PyAPI_FUNC(void) PyException_SetContext(PyObject *, PyObject *); + + +PyAPI_FUNC(PyObject *) PyException_GetArgs(PyObject *); +PyAPI_FUNC(void) PyException_SetArgs(PyObject *, PyObject *); + +/* */ + +#define PyExceptionClass_Check(x) \ + (PyType_Check((x)) && \ + PyType_FastSubclass((PyTypeObject*)(x), Py_TPFLAGS_BASE_EXC_SUBCLASS)) + +#define PyExceptionInstance_Check(x) \ + PyType_FastSubclass(Py_TYPE(x), Py_TPFLAGS_BASE_EXC_SUBCLASS) + +PyAPI_FUNC(const char *) PyExceptionClass_Name(PyObject *); + +#define PyExceptionInstance_Class(x) _PyObject_CAST(Py_TYPE(x)) + +#define _PyBaseExceptionGroup_Check(x) \ + PyObject_TypeCheck((x), (PyTypeObject *)PyExc_BaseExceptionGroup) + +/* Predefined exceptions */ + +PyAPI_DATA(PyObject *) PyExc_BaseException; +PyAPI_DATA(PyObject *) PyExc_Exception; +PyAPI_DATA(PyObject *) PyExc_BaseExceptionGroup; +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 +PyAPI_DATA(PyObject *) PyExc_StopAsyncIteration; +#endif +PyAPI_DATA(PyObject *) PyExc_StopIteration; +PyAPI_DATA(PyObject *) PyExc_GeneratorExit; +PyAPI_DATA(PyObject *) PyExc_ArithmeticError; +PyAPI_DATA(PyObject *) PyExc_LookupError; + +PyAPI_DATA(PyObject *) PyExc_AssertionError; +PyAPI_DATA(PyObject *) PyExc_AttributeError; +PyAPI_DATA(PyObject *) PyExc_BufferError; +PyAPI_DATA(PyObject *) PyExc_EOFError; +PyAPI_DATA(PyObject *) PyExc_FloatingPointError; +PyAPI_DATA(PyObject *) PyExc_OSError; +PyAPI_DATA(PyObject *) PyExc_ImportError; +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03060000 +PyAPI_DATA(PyObject *) PyExc_ModuleNotFoundError; +#endif +PyAPI_DATA(PyObject *) PyExc_IndexError; +PyAPI_DATA(PyObject *) PyExc_KeyError; +PyAPI_DATA(PyObject *) PyExc_KeyboardInterrupt; +PyAPI_DATA(PyObject *) PyExc_MemoryError; +PyAPI_DATA(PyObject *) PyExc_NameError; +PyAPI_DATA(PyObject *) PyExc_OverflowError; +PyAPI_DATA(PyObject *) PyExc_RuntimeError; +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 
+PyAPI_DATA(PyObject *) PyExc_RecursionError; +#endif +PyAPI_DATA(PyObject *) PyExc_NotImplementedError; +PyAPI_DATA(PyObject *) PyExc_SyntaxError; +PyAPI_DATA(PyObject *) PyExc_IndentationError; +PyAPI_DATA(PyObject *) PyExc_TabError; +PyAPI_DATA(PyObject *) PyExc_ReferenceError; +PyAPI_DATA(PyObject *) PyExc_SystemError; +PyAPI_DATA(PyObject *) PyExc_SystemExit; +PyAPI_DATA(PyObject *) PyExc_TypeError; +PyAPI_DATA(PyObject *) PyExc_UnboundLocalError; +PyAPI_DATA(PyObject *) PyExc_UnicodeError; +PyAPI_DATA(PyObject *) PyExc_UnicodeEncodeError; +PyAPI_DATA(PyObject *) PyExc_UnicodeDecodeError; +PyAPI_DATA(PyObject *) PyExc_UnicodeTranslateError; +PyAPI_DATA(PyObject *) PyExc_ValueError; +PyAPI_DATA(PyObject *) PyExc_ZeroDivisionError; + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_DATA(PyObject *) PyExc_BlockingIOError; +PyAPI_DATA(PyObject *) PyExc_BrokenPipeError; +PyAPI_DATA(PyObject *) PyExc_ChildProcessError; +PyAPI_DATA(PyObject *) PyExc_ConnectionError; +PyAPI_DATA(PyObject *) PyExc_ConnectionAbortedError; +PyAPI_DATA(PyObject *) PyExc_ConnectionRefusedError; +PyAPI_DATA(PyObject *) PyExc_ConnectionResetError; +PyAPI_DATA(PyObject *) PyExc_FileExistsError; +PyAPI_DATA(PyObject *) PyExc_FileNotFoundError; +PyAPI_DATA(PyObject *) PyExc_InterruptedError; +PyAPI_DATA(PyObject *) PyExc_IsADirectoryError; +PyAPI_DATA(PyObject *) PyExc_NotADirectoryError; +PyAPI_DATA(PyObject *) PyExc_PermissionError; +PyAPI_DATA(PyObject *) PyExc_ProcessLookupError; +PyAPI_DATA(PyObject *) PyExc_TimeoutError; +#endif + + +/* Compatibility aliases */ +PyAPI_DATA(PyObject *) PyExc_EnvironmentError; +PyAPI_DATA(PyObject *) PyExc_IOError; +#ifdef MS_WINDOWS +PyAPI_DATA(PyObject *) PyExc_WindowsError; +#endif + +/* Predefined warning categories */ +PyAPI_DATA(PyObject *) PyExc_Warning; +PyAPI_DATA(PyObject *) PyExc_UserWarning; +PyAPI_DATA(PyObject *) PyExc_DeprecationWarning; +PyAPI_DATA(PyObject *) PyExc_PendingDeprecationWarning; +PyAPI_DATA(PyObject *) PyExc_SyntaxWarning; +PyAPI_DATA(PyObject *) PyExc_RuntimeWarning; +PyAPI_DATA(PyObject *) PyExc_FutureWarning; +PyAPI_DATA(PyObject *) PyExc_ImportWarning; +PyAPI_DATA(PyObject *) PyExc_UnicodeWarning; +PyAPI_DATA(PyObject *) PyExc_BytesWarning; +PyAPI_DATA(PyObject *) PyExc_EncodingWarning; +PyAPI_DATA(PyObject *) PyExc_ResourceWarning; + + +/* Convenience functions */ + +PyAPI_FUNC(int) PyErr_BadArgument(void); +PyAPI_FUNC(PyObject *) PyErr_NoMemory(void); +PyAPI_FUNC(PyObject *) PyErr_SetFromErrno(PyObject *); +PyAPI_FUNC(PyObject *) PyErr_SetFromErrnoWithFilenameObject( + PyObject *, PyObject *); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03040000 +PyAPI_FUNC(PyObject *) PyErr_SetFromErrnoWithFilenameObjects( + PyObject *, PyObject *, PyObject *); +#endif +PyAPI_FUNC(PyObject *) PyErr_SetFromErrnoWithFilename( + PyObject *exc, + const char *filename /* decoded from the filesystem encoding */ + ); + +PyAPI_FUNC(PyObject *) PyErr_Format( + PyObject *exception, + const char *format, /* ASCII-encoded string */ + ... 
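+    /* printf-style; PyErr_Format() always returns NULL, so a caller can
+       write, e.g.:
+           return PyErr_Format(PyExc_TypeError, "expected str, got %.200s",
+                               Py_TYPE(obj)->tp_name);
+       (obj here being whatever object failed the check) */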
+ ); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 +PyAPI_FUNC(PyObject *) PyErr_FormatV( + PyObject *exception, + const char *format, + va_list vargs); +#endif + +#ifdef MS_WINDOWS +PyAPI_FUNC(PyObject *) PyErr_SetFromWindowsErrWithFilename( + int ierr, + const char *filename /* decoded from the filesystem encoding */ + ); +PyAPI_FUNC(PyObject *) PyErr_SetFromWindowsErr(int); +PyAPI_FUNC(PyObject *) PyErr_SetExcFromWindowsErrWithFilenameObject( + PyObject *,int, PyObject *); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03040000 +PyAPI_FUNC(PyObject *) PyErr_SetExcFromWindowsErrWithFilenameObjects( + PyObject *,int, PyObject *, PyObject *); +#endif +PyAPI_FUNC(PyObject *) PyErr_SetExcFromWindowsErrWithFilename( + PyObject *exc, + int ierr, + const char *filename /* decoded from the filesystem encoding */ + ); +PyAPI_FUNC(PyObject *) PyErr_SetExcFromWindowsErr(PyObject *, int); +#endif /* MS_WINDOWS */ + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03060000 +PyAPI_FUNC(PyObject *) PyErr_SetImportErrorSubclass(PyObject *, PyObject *, + PyObject *, PyObject *); +#endif +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(PyObject *) PyErr_SetImportError(PyObject *, PyObject *, + PyObject *); +#endif + +/* Export the old function so that the existing API remains available: */ +PyAPI_FUNC(void) PyErr_BadInternalCall(void); +PyAPI_FUNC(void) _PyErr_BadInternalCall(const char *filename, int lineno); +/* Mask the old API with a call to the new API for code compiled under + Python 2.0: */ +#define PyErr_BadInternalCall() _PyErr_BadInternalCall(__FILE__, __LINE__) + +/* Function to create a new exception */ +PyAPI_FUNC(PyObject *) PyErr_NewException( + const char *name, PyObject *base, PyObject *dict); +PyAPI_FUNC(PyObject *) PyErr_NewExceptionWithDoc( + const char *name, const char *doc, PyObject *base, PyObject *dict); +PyAPI_FUNC(void) PyErr_WriteUnraisable(PyObject *); + + +/* In signalmodule.c */ +PyAPI_FUNC(int) PyErr_CheckSignals(void); +PyAPI_FUNC(void) PyErr_SetInterrupt(void); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 +PyAPI_FUNC(int) PyErr_SetInterruptEx(int signum); +#endif + +/* Support for adding program text to SyntaxErrors */ +PyAPI_FUNC(void) PyErr_SyntaxLocation( + const char *filename, /* decoded from the filesystem encoding */ + int lineno); +PyAPI_FUNC(void) PyErr_SyntaxLocationEx( + const char *filename, /* decoded from the filesystem encoding */ + int lineno, + int col_offset); +PyAPI_FUNC(PyObject *) PyErr_ProgramText( + const char *filename, /* decoded from the filesystem encoding */ + int lineno); + +/* The following functions are used to create and modify unicode + exceptions from C */ + +/* create a UnicodeDecodeError object */ +PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_Create( + const char *encoding, /* UTF-8 encoded string */ + const char *object, + Py_ssize_t length, + Py_ssize_t start, + Py_ssize_t end, + const char *reason /* UTF-8 encoded string */ + ); + +/* get the encoding attribute */ +PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetEncoding(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetEncoding(PyObject *); + +/* get the object attribute */ +PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetObject(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetObject(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_GetObject(PyObject *); + +/* get the value of the start attribute (the int * may not be NULL) + return 0 on success, -1 on failure */ 
+PyAPI_FUNC(int) PyUnicodeEncodeError_GetStart(PyObject *, Py_ssize_t *); +PyAPI_FUNC(int) PyUnicodeDecodeError_GetStart(PyObject *, Py_ssize_t *); +PyAPI_FUNC(int) PyUnicodeTranslateError_GetStart(PyObject *, Py_ssize_t *); + +/* assign a new value to the start attribute + return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_SetStart(PyObject *, Py_ssize_t); +PyAPI_FUNC(int) PyUnicodeDecodeError_SetStart(PyObject *, Py_ssize_t); +PyAPI_FUNC(int) PyUnicodeTranslateError_SetStart(PyObject *, Py_ssize_t); + +/* get the value of the end attribute (the int *may not be NULL) + return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_GetEnd(PyObject *, Py_ssize_t *); +PyAPI_FUNC(int) PyUnicodeDecodeError_GetEnd(PyObject *, Py_ssize_t *); +PyAPI_FUNC(int) PyUnicodeTranslateError_GetEnd(PyObject *, Py_ssize_t *); + +/* assign a new value to the end attribute + return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_SetEnd(PyObject *, Py_ssize_t); +PyAPI_FUNC(int) PyUnicodeDecodeError_SetEnd(PyObject *, Py_ssize_t); +PyAPI_FUNC(int) PyUnicodeTranslateError_SetEnd(PyObject *, Py_ssize_t); + +/* get the value of the reason attribute */ +PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetReason(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetReason(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_GetReason(PyObject *); + +/* assign a new value to the reason attribute + return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_SetReason( + PyObject *exc, + const char *reason /* UTF-8 encoded string */ + ); +PyAPI_FUNC(int) PyUnicodeDecodeError_SetReason( + PyObject *exc, + const char *reason /* UTF-8 encoded string */ + ); +PyAPI_FUNC(int) PyUnicodeTranslateError_SetReason( + PyObject *exc, + const char *reason /* UTF-8 encoded string */ + ); + +PyAPI_FUNC(int) PyOS_snprintf(char *str, size_t size, const char *format, ...) + Py_GCC_ATTRIBUTE((format(printf, 3, 4))); +PyAPI_FUNC(int) PyOS_vsnprintf(char *str, size_t size, const char *format, va_list va) + Py_GCC_ATTRIBUTE((format(printf, 3, 0))); + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_ERRORS_H +# include "cpython/pyerrors.h" +# undef Py_CPYTHON_ERRORS_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_ERRORS_H */ diff --git a/Include/pyexpat.h b/Include/pyexpat.h new file mode 100644 index 0000000000000000000000000000000000000000..9824d099c3df7d08083ee6f9232dedcbe9d846a0 --- /dev/null +++ b/Include/pyexpat.h @@ -0,0 +1,57 @@ +/* Stuff to export relevant 'expat' entry points from pyexpat to other + * parser modules, such as cElementTree. */ + +/* note: you must import expat.h before importing this module! */ + +#define PyExpat_CAPI_MAGIC "pyexpat.expat_CAPI 1.1" +#define PyExpat_CAPSULE_NAME "pyexpat.expat_CAPI" + +struct PyExpat_CAPI +{ + char* magic; /* set to PyExpat_CAPI_MAGIC */ + int size; /* set to sizeof(struct PyExpat_CAPI) */ + int MAJOR_VERSION; + int MINOR_VERSION; + int MICRO_VERSION; + /* pointers to selected expat functions. 
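+       Consumers locate this table through the capsule mechanism, e.g.
+       (a sketch):
+           struct PyExpat_CAPI *api =
+               (struct PyExpat_CAPI *)PyCapsule_Import(PyExpat_CAPSULE_NAME, 0);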
add new functions at + the end, if needed */ + const XML_LChar * (*ErrorString)(enum XML_Error code); + enum XML_Error (*GetErrorCode)(XML_Parser parser); + XML_Size (*GetErrorColumnNumber)(XML_Parser parser); + XML_Size (*GetErrorLineNumber)(XML_Parser parser); + enum XML_Status (*Parse)( + XML_Parser parser, const char *s, int len, int isFinal); + XML_Parser (*ParserCreate_MM)( + const XML_Char *encoding, const XML_Memory_Handling_Suite *memsuite, + const XML_Char *namespaceSeparator); + void (*ParserFree)(XML_Parser parser); + void (*SetCharacterDataHandler)( + XML_Parser parser, XML_CharacterDataHandler handler); + void (*SetCommentHandler)( + XML_Parser parser, XML_CommentHandler handler); + void (*SetDefaultHandlerExpand)( + XML_Parser parser, XML_DefaultHandler handler); + void (*SetElementHandler)( + XML_Parser parser, XML_StartElementHandler start, + XML_EndElementHandler end); + void (*SetNamespaceDeclHandler)( + XML_Parser parser, XML_StartNamespaceDeclHandler start, + XML_EndNamespaceDeclHandler end); + void (*SetProcessingInstructionHandler)( + XML_Parser parser, XML_ProcessingInstructionHandler handler); + void (*SetUnknownEncodingHandler)( + XML_Parser parser, XML_UnknownEncodingHandler handler, + void *encodingHandlerData); + void (*SetUserData)(XML_Parser parser, void *userData); + void (*SetStartDoctypeDeclHandler)(XML_Parser parser, + XML_StartDoctypeDeclHandler start); + enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding); + int (*DefaultUnknownEncodingHandler)( + void *encodingHandlerData, const XML_Char *name, XML_Encoding *info); + /* might be NULL for expat < 2.1.0 */ + int (*SetHashSalt)(XML_Parser parser, unsigned long hash_salt); + /* might be NULL for expat < 2.6.0 */ + XML_Bool (*SetReparseDeferralEnabled)(XML_Parser parser, XML_Bool enabled); + /* always add new stuff to the end! */ +}; + diff --git a/Include/pyframe.h b/Include/pyframe.h new file mode 100644 index 0000000000000000000000000000000000000000..13d52312ea966e43322950405d06440e0c988fd9 --- /dev/null +++ b/Include/pyframe.h @@ -0,0 +1,26 @@ +/* Limited C API of PyFrame API + * + * Include "frameobject.h" to get the PyFrameObject structure. + */ + +#ifndef Py_PYFRAME_H +#define Py_PYFRAME_H +#ifdef __cplusplus +extern "C" { +#endif + +/* Return the line of code the frame is currently executing. */ +PyAPI_FUNC(int) PyFrame_GetLineNumber(PyFrameObject *); + +PyAPI_FUNC(PyCodeObject *) PyFrame_GetCode(PyFrameObject *frame); + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_PYFRAME_H +# include "cpython/pyframe.h" +# undef Py_CPYTHON_PYFRAME_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_PYFRAME_H */ diff --git a/Include/pyhash.h b/Include/pyhash.h new file mode 100644 index 0000000000000000000000000000000000000000..3e23e2758808d793ae8ffe716e898a0bceb6f3d3 --- /dev/null +++ b/Include/pyhash.h @@ -0,0 +1,59 @@ +#ifndef Py_HASH_H +#define Py_HASH_H +#ifdef __cplusplus +extern "C" { +#endif + +/* Cutoff for small string DJBX33A optimization in range [1, cutoff). + * + * About 50% of the strings in a typical Python application are smaller than + * 6 to 7 chars. However DJBX33A is vulnerable to hash collision attacks. + * NEVER use DJBX33A for long strings! + * + * A Py_HASH_CUTOFF of 0 disables small string optimization. 32 bit platforms + * should use a smaller cutoff because it is easier to create colliding + * strings. A cutoff of 7 on 64bit platforms and 5 on 32bit platforms should + * provide a decent safety margin. 
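+ *
+ * For example (illustrative only): compiling with Py_HASH_CUTOFF defined as 5
+ * routes strings of 1 to 4 bytes through DJBX33A and everything longer
+ * through the hardened default algorithm.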
+ */
+#ifndef Py_HASH_CUTOFF
+#  define Py_HASH_CUTOFF 0
+#elif (Py_HASH_CUTOFF > 7 || Py_HASH_CUTOFF < 0)
+#  error Py_HASH_CUTOFF must be in range 0...7.
+#endif /* Py_HASH_CUTOFF */
+
+
+/* Hash algorithm selection
+ *
+ * The values for Py_HASH_* are hard-coded in the
+ * configure script.
+ *
+ * - FNV and SIPHASH* are available on all platforms and architectures.
+ * - With EXTERNAL, embedders can provide an alternative implementation with::
+ *
+ *     PyHash_FuncDef PyHash_Func = {...};
+ *
+ * XXX: Figure out __declspec() for extern PyHash_FuncDef.
+ */
+#define Py_HASH_EXTERNAL 0
+#define Py_HASH_SIPHASH24 1
+#define Py_HASH_FNV 2
+#define Py_HASH_SIPHASH13 3
+
+#ifndef Py_HASH_ALGORITHM
+#  ifndef HAVE_ALIGNED_REQUIRED
+#    define Py_HASH_ALGORITHM Py_HASH_SIPHASH13
+#  else
+#    define Py_HASH_ALGORITHM Py_HASH_FNV
+#  endif /* uint64_t && uint32_t && aligned */
+#endif /* Py_HASH_ALGORITHM */
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_HASH_H
+#  include "cpython/pyhash.h"
+#  undef Py_CPYTHON_HASH_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif // !Py_HASH_H
diff --git a/Include/pylifecycle.h b/Include/pylifecycle.h
new file mode 100644
index 0000000000000000000000000000000000000000..de1bcb1d2cb632b026220bc02808352915b61c34
--- /dev/null
+++ b/Include/pylifecycle.h
@@ -0,0 +1,80 @@
+
+/* Interfaces to configure, query, create & destroy the Python runtime */
+
+#ifndef Py_PYLIFECYCLE_H
+#define Py_PYLIFECYCLE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Initialization and finalization */
+PyAPI_FUNC(void) Py_Initialize(void);
+PyAPI_FUNC(void) Py_InitializeEx(int);
+PyAPI_FUNC(void) Py_Finalize(void);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03060000
+PyAPI_FUNC(int) Py_FinalizeEx(void);
+#endif
+PyAPI_FUNC(int) Py_IsInitialized(void);
+
+/* Subinterpreter support */
+PyAPI_FUNC(PyThreadState *) Py_NewInterpreter(void);
+PyAPI_FUNC(void) Py_EndInterpreter(PyThreadState *);
+
+
+/* Py_PyAtExit is for the atexit module, Py_AtExit is for low-level
+ * exit functions.
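+ *
+ * For example (a sketch; flush_caches stands in for any low-level cleanup
+ * hook; at most 32 functions can be registered):
+ *
+ *     if (Py_AtExit(flush_caches) < 0) {
+ *         ...registration table is full, fall back to manual cleanup...
+ *     }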
+ */ +PyAPI_FUNC(int) Py_AtExit(void (*func)(void)); + +PyAPI_FUNC(void) _Py_NO_RETURN Py_Exit(int); + +/* Bootstrap __main__ (defined in Modules/main.c) */ +PyAPI_FUNC(int) Py_Main(int argc, wchar_t **argv); +PyAPI_FUNC(int) Py_BytesMain(int argc, char **argv); + +/* In pathconfig.c */ +Py_DEPRECATED(3.11) PyAPI_FUNC(void) Py_SetProgramName(const wchar_t *); +Py_DEPRECATED(3.13) PyAPI_FUNC(wchar_t *) Py_GetProgramName(void); + +Py_DEPRECATED(3.11) PyAPI_FUNC(void) Py_SetPythonHome(const wchar_t *); +Py_DEPRECATED(3.13) PyAPI_FUNC(wchar_t *) Py_GetPythonHome(void); + +Py_DEPRECATED(3.13) PyAPI_FUNC(wchar_t *) Py_GetProgramFullPath(void); +Py_DEPRECATED(3.13) PyAPI_FUNC(wchar_t *) Py_GetPrefix(void); +Py_DEPRECATED(3.13) PyAPI_FUNC(wchar_t *) Py_GetExecPrefix(void); +Py_DEPRECATED(3.13) PyAPI_FUNC(wchar_t *) Py_GetPath(void); +#ifdef MS_WINDOWS +int _Py_CheckPython3(void); +#endif + +/* In their own files */ +PyAPI_FUNC(const char *) Py_GetVersion(void); +PyAPI_FUNC(const char *) Py_GetPlatform(void); +PyAPI_FUNC(const char *) Py_GetCopyright(void); +PyAPI_FUNC(const char *) Py_GetCompiler(void); +PyAPI_FUNC(const char *) Py_GetBuildInfo(void); + +/* Signals */ +typedef void (*PyOS_sighandler_t)(int); +PyAPI_FUNC(PyOS_sighandler_t) PyOS_getsig(int); +PyAPI_FUNC(PyOS_sighandler_t) PyOS_setsig(int, PyOS_sighandler_t); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030B0000 +PyAPI_DATA(const unsigned long) Py_Version; +#endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 +PyAPI_FUNC(int) Py_IsFinalizing(void); +#endif + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_PYLIFECYCLE_H +# include "cpython/pylifecycle.h" +# undef Py_CPYTHON_PYLIFECYCLE_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_PYLIFECYCLE_H */ diff --git a/Include/pymacconfig.h b/Include/pymacconfig.h new file mode 100644 index 0000000000000000000000000000000000000000..615abe103ca0388c7ce67f142676d64bb8ef1697 --- /dev/null +++ b/Include/pymacconfig.h @@ -0,0 +1,91 @@ +// This file moves some of the autoconf magic to compile-time when building on +// macOS. This is needed for building 4-way universal binaries and for 64-bit +// universal binaries because the values redefined below aren't configure-time +// constant but only compile-time constant in these scenarios. 
+
+#ifndef PY_MACCONFIG_H
+#define PY_MACCONFIG_H
+#ifdef __APPLE__
+
+#undef ALIGNOF_MAX_ALIGN_T
+#undef SIZEOF_LONG
+#undef SIZEOF_LONG_DOUBLE
+#undef SIZEOF_PTHREAD_T
+#undef SIZEOF_SIZE_T
+#undef SIZEOF_TIME_T
+#undef SIZEOF_VOID_P
+#undef SIZEOF__BOOL
+#undef SIZEOF_UINTPTR_T
+#undef WORDS_BIGENDIAN
+#undef DOUBLE_IS_ARM_MIXED_ENDIAN_IEEE754
+#undef DOUBLE_IS_BIG_ENDIAN_IEEE754
+#undef DOUBLE_IS_LITTLE_ENDIAN_IEEE754
+#undef HAVE_GCC_ASM_FOR_X87
+#undef HAVE_GCC_ASM_FOR_X64
+
+#undef VA_LIST_IS_ARRAY
+#if defined(__LP64__) && defined(__x86_64__)
+#  define VA_LIST_IS_ARRAY 1
+#endif
+
+#undef HAVE_LARGEFILE_SUPPORT
+#ifndef __LP64__
+#  define HAVE_LARGEFILE_SUPPORT 1
+#endif
+
+#undef SIZEOF_LONG
+#ifdef __LP64__
+#  define SIZEOF__BOOL 1
+#  define SIZEOF_LONG 8
+#  define SIZEOF_PTHREAD_T 8
+#  define SIZEOF_SIZE_T 8
+#  define SIZEOF_TIME_T 8
+#  define SIZEOF_VOID_P 8
+#  define SIZEOF_UINTPTR_T 8
+#else
+#  ifdef __ppc__
+#    define SIZEOF__BOOL 4
+#  else
+#    define SIZEOF__BOOL 1
+#  endif
+#  define SIZEOF_LONG 4
+#  define SIZEOF_PTHREAD_T 4
+#  define SIZEOF_SIZE_T 4
+#  define SIZEOF_TIME_T 4
+#  define SIZEOF_VOID_P 4
+#  define SIZEOF_UINTPTR_T 4
+#endif
+
+// macOS 10.4 (the first release to support 64-bit code
+// at all) only supports 64-bit in the UNIX layer.
+// Therefore suppress the toolbox-glue in 64-bit mode.
+//
+// In 64-bit mode setpgrp always has no arguments; in 32-bit
+// mode that depends on the compilation environment.
+#if defined(__LP64__)
+#  undef SETPGRP_HAVE_ARG
+#endif
+
+#ifdef __BIG_ENDIAN__
+#  define WORDS_BIGENDIAN 1
+#  define DOUBLE_IS_BIG_ENDIAN_IEEE754
+#else
+#  define DOUBLE_IS_LITTLE_ENDIAN_IEEE754
+#endif
+
+#if defined(__i386__) || defined(__x86_64__)
+#  define HAVE_GCC_ASM_FOR_X87
+#  define ALIGNOF_MAX_ALIGN_T 16
+#  define HAVE_GCC_ASM_FOR_X64 1
+#  define SIZEOF_LONG_DOUBLE 16
+#else
+#  define ALIGNOF_MAX_ALIGN_T 8
+#  define SIZEOF_LONG_DOUBLE 8
+#endif
+
+#endif // __APPLE__
+#endif // !PY_MACCONFIG_H
diff --git a/Include/pymacro.h b/Include/pymacro.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0378f9d27a048c26b87fe4b3ae3869aaae5cc9b
--- /dev/null
+++ b/Include/pymacro.h
@@ -0,0 +1,193 @@
+#ifndef Py_PYMACRO_H
+#define Py_PYMACRO_H
+
+// gh-91782: On FreeBSD 12, if the _POSIX_C_SOURCE and _XOPEN_SOURCE macros are
+// defined, <sys/cdefs.h> disables C11 support and does not define
+// the static_assert() macro.
+// https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=255290
+//
+// macOS <= 10.10 doesn't define static_assert in assert.h at all despite
+// having C11 compiler support.
+//
+// static_assert is defined in glibc from version 2.16. Compiler support for
+// the C11 _Static_assert keyword is in gcc >= 4.6.
+//
+// MSVC makes static_assert a keyword in C11-17, contrary to the standards.
+//
+// In C++11 and C2x, static_assert is a keyword, and redefining it is undefined
+// behaviour. So only define it when building as C, not C++ (if __cplusplus is
+// not defined), and only for C11-17.
+#if !defined(static_assert) && (defined(__GNUC__) || defined(__clang__)) \
+        && !defined(__cplusplus) && defined(__STDC_VERSION__) \
+        && __STDC_VERSION__ >= 201112L && __STDC_VERSION__ <= 201710L
+#  define static_assert _Static_assert
+#endif
+
+/* Minimum value between x and y */
+#define Py_MIN(x, y) (((x) > (y)) ? (y) : (x))
+
+/* Maximum value between x and y */
+#define Py_MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+/* Absolute value of the number x */
+#define Py_ABS(x) ((x) < 0 ? -(x) : (x))
+
+#define _Py_XSTRINGIFY(x) #x
+
+/* Convert the argument to a string. For example, Py_STRINGIFY(123) is replaced
+   with "123" by the preprocessor. Defines are also replaced by their value.
+   For example Py_STRINGIFY(__LINE__) is replaced by the line number, not
+   by "__LINE__". */
+#define Py_STRINGIFY(x) _Py_XSTRINGIFY(x)
+
+/* Get the size of a structure member in bytes */
+#define Py_MEMBER_SIZE(type, member) sizeof(((type *)0)->member)
+
+/* Argument must be a char or an int in [-128, 127] or [0, 255]. */
+#define Py_CHARMASK(c) ((unsigned char)((c) & 0xff))
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L \
+     && !defined(__cplusplus) && !defined(_MSC_VER))
+#  define Py_BUILD_ASSERT_EXPR(cond) \
+    ((void)sizeof(struct { int dummy; _Static_assert(cond, #cond); }), \
+     0)
+#else
+   /* Assert a build-time dependency, as an expression.
+    *
+    * Your compile will fail if the condition isn't true, or can't be evaluated
+    * by the compiler. This can be used in an expression: its value is 0.
+    *
+    * Example:
+    *
+    *   #define foo_to_char(foo)  \
+    *            ((char *)(foo)   \
+    *             + Py_BUILD_ASSERT_EXPR(offsetof(struct foo, string) == 0))
+    *
+    * Written by Rusty Russell, public domain, http://ccodearchive.net/
+    */
+#  define Py_BUILD_ASSERT_EXPR(cond) \
+    (sizeof(char [1 - 2*!(cond)]) - 1)
+#endif
+
+#if ((defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) \
+     || (defined(__cplusplus) && __cplusplus >= 201103L))
+   // Use static_assert() on C11 and newer
+#  define Py_BUILD_ASSERT(cond) \
+        do { \
+            static_assert((cond), #cond); \
+        } while (0)
+#else
+#  define Py_BUILD_ASSERT(cond) \
+        do { \
+            (void)Py_BUILD_ASSERT_EXPR(cond); \
+        } while(0)
+#endif
+
+/* Get the number of elements in a visible array
+
+   This does not work on pointers, or arrays declared as [], or function
+   parameters. With correct compiler support, such usage will cause a build
+   error (see Py_BUILD_ASSERT_EXPR).
+
+   Written by Rusty Russell, public domain, http://ccodearchive.net/
+
+   Requires GCC 3.1+ */
+#if (defined(__GNUC__) && !defined(__STRICT_ANSI__) && \
+    (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) || (__GNUC__ >= 4)))
+/* Two gcc extensions.
+   &a[0] degrades to a pointer: a different type from an array */
+#define Py_ARRAY_LENGTH(array) \
+    (sizeof(array) / sizeof((array)[0]) \
+     + Py_BUILD_ASSERT_EXPR(!__builtin_types_compatible_p(typeof(array), \
+                                                          typeof(&(array)[0]))))
+#else
+#define Py_ARRAY_LENGTH(array) \
+    (sizeof(array) / sizeof((array)[0]))
+#endif
+
+
+/* Define macros for inline documentation. */
+#define PyDoc_VAR(name) static const char name[]
+#define PyDoc_STRVAR(name,str) PyDoc_VAR(name) = PyDoc_STR(str)
+#ifdef WITH_DOC_STRINGS
+#define PyDoc_STR(str) str
+#else
+#define PyDoc_STR(str) ""
+#endif
+
+/* Below "a" is a power of 2. */
+/* Round down size "n" to be a multiple of "a". */
+#define _Py_SIZE_ROUND_DOWN(n, a) ((size_t)(n) & ~(size_t)((a) - 1))
+/* Round up size "n" to be a multiple of "a". */
+#define _Py_SIZE_ROUND_UP(n, a) (((size_t)(n) + \
+        (size_t)((a) - 1)) & ~(size_t)((a) - 1))
+/* Round pointer "p" down to the closest "a"-aligned address <= "p". */
+#define _Py_ALIGN_DOWN(p, a) ((void *)((uintptr_t)(p) & ~(uintptr_t)((a) - 1)))
+/* Round pointer "p" up to the closest "a"-aligned address >= "p". */
+#define _Py_ALIGN_UP(p, a) ((void *)(((uintptr_t)(p) + \
+        (uintptr_t)((a) - 1)) & ~(uintptr_t)((a) - 1)))
+/* Check if pointer "p" is aligned to "a"-bytes boundary.
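+   For example, _Py_IS_ALIGNED(p, 16) is non-zero exactly when the low four
+   bits of the address are zero, and _Py_SIZE_ROUND_UP(n, 8) rounds n up to
+   a multiple of 8.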
*/ +#define _Py_IS_ALIGNED(p, a) (!((uintptr_t)(p) & (uintptr_t)((a) - 1))) + +/* Use this for unused arguments in a function definition to silence compiler + * warnings. Example: + * + * int func(int a, int Py_UNUSED(b)) { return a; } + */ +#if defined(__GNUC__) || defined(__clang__) +# define Py_UNUSED(name) _unused_ ## name __attribute__((unused)) +#elif defined(_MSC_VER) + // Disable warning C4100: unreferenced formal parameter, + // declare the parameter, + // restore old compiler warnings. +# define Py_UNUSED(name) \ + __pragma(warning(push)) \ + __pragma(warning(suppress: 4100)) \ + _unused_ ## name \ + __pragma(warning(pop)) +#else +# define Py_UNUSED(name) _unused_ ## name +#endif + +#if defined(RANDALL_WAS_HERE) +# define Py_UNREACHABLE() \ + Py_FatalError( \ + "If you're seeing this, the code is in what I thought was\n" \ + "an unreachable state.\n\n" \ + "I could give you advice for what to do, but honestly, why\n" \ + "should you trust me? I clearly screwed this up. I'm writing\n" \ + "a message that should never appear, yet I know it will\n" \ + "probably appear someday.\n\n" \ + "On a deep level, I know I'm not up to this task.\n" \ + "I'm so sorry.\n" \ + "https://xkcd.com/2200") +#elif defined(Py_DEBUG) +# define Py_UNREACHABLE() \ + Py_FatalError( \ + "We've reached an unreachable state. Anything is possible.\n" \ + "The limits were in our heads all along. Follow your dreams.\n" \ + "https://xkcd.com/2200") +#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)) +# define Py_UNREACHABLE() __builtin_unreachable() +#elif defined(__clang__) || defined(__INTEL_COMPILER) +# define Py_UNREACHABLE() __builtin_unreachable() +#elif defined(_MSC_VER) +# define Py_UNREACHABLE() __assume(0) +#else +# define Py_UNREACHABLE() \ + Py_FatalError("Unreachable C code path reached") +#endif + +#define _Py_CONTAINER_OF(ptr, type, member) \ + (type*)((char*)ptr - offsetof(type, member)) + +// Prevent using an expression as a l-value. +// For example, "int x; _Py_RVALUE(x) = 1;" fails with a compiler error. +#define _Py_RVALUE(EXPR) ((void)0, (EXPR)) + +// Return non-zero if the type is signed, return zero if it's unsigned. +// Use "<= 0" rather than "< 0" to prevent the compiler warning: +// "comparison of unsigned expression in '< 0' is always false". +#define _Py_IS_TYPE_SIGNED(type) ((type)(-1) <= 0) + +#endif /* Py_PYMACRO_H */ diff --git a/Include/pymath.h b/Include/pymath.h new file mode 100644 index 0000000000000000000000000000000000000000..4c1e3d9984894b0cb27dfb387e3e3b8781e14091 --- /dev/null +++ b/Include/pymath.h @@ -0,0 +1,62 @@ +// Symbols and macros to supply platform-independent interfaces to mathematical +// functions and constants. + +#ifndef Py_PYMATH_H +#define Py_PYMATH_H + +/* High precision definition of pi and e (Euler) + * The values are taken from libc6's math.h. + */ +#ifndef Py_MATH_PIl +#define Py_MATH_PIl 3.1415926535897932384626433832795029L +#endif +#ifndef Py_MATH_PI +#define Py_MATH_PI 3.14159265358979323846 +#endif + +#ifndef Py_MATH_El +#define Py_MATH_El 2.7182818284590452353602874713526625L +#endif + +#ifndef Py_MATH_E +#define Py_MATH_E 2.7182818284590452354 +#endif + +/* Tau (2pi) to 40 digits, taken from tauday.com/tau-digits. */ +#ifndef Py_MATH_TAU +#define Py_MATH_TAU 6.2831853071795864769252867665590057683943L +#endif + +// Py_IS_NAN(X) +// Return 1 if float or double arg is a NaN, else 0. +#define Py_IS_NAN(X) isnan(X) + +// Py_IS_INFINITY(X) +// Return 1 if float or double arg is an infinity, else 0. 
+#define Py_IS_INFINITY(X) isinf(X) + +// Py_IS_FINITE(X) +// Return 1 if float or double arg is neither infinite nor NAN, else 0. +#define Py_IS_FINITE(X) isfinite(X) + +// Py_INFINITY: Value that evaluates to a positive double infinity. +#ifndef Py_INFINITY +# define Py_INFINITY ((double)INFINITY) +#endif + +/* Py_HUGE_VAL should always be the same as Py_INFINITY. But historically + * this was not reliable and Python did not require IEEE floats and C99 + * conformity. Prefer Py_INFINITY for new code. + */ +#ifndef Py_HUGE_VAL +# define Py_HUGE_VAL HUGE_VAL +#endif + +/* Py_NAN: Value that evaluates to a quiet Not-a-Number (NaN). The sign is + * undefined and normally not relevant, but e.g. fixed for float("nan"). + */ +#if !defined(Py_NAN) +# define Py_NAN ((double)NAN) +#endif + +#endif /* Py_PYMATH_H */ diff --git a/Include/pymem.h b/Include/pymem.h new file mode 100644 index 0000000000000000000000000000000000000000..a80da99e1dd7fc5710e22f780e75111e0c12f21b --- /dev/null +++ b/Include/pymem.h @@ -0,0 +1,110 @@ +// The PyMem_ family: low-level memory allocation interfaces. +// See objimpl.h for the PyObject_ memory family. + +#ifndef Py_PYMEM_H +#define Py_PYMEM_H +#ifdef __cplusplus +extern "C" { +#endif + +/* BEWARE: + + Each interface exports both functions and macros. Extension modules should + use the functions, to ensure binary compatibility across Python versions. + Because the Python implementation is free to change internal details, and + the macros may (or may not) expose details for speed, if you do use the + macros you must recompile your extensions with each Python release. + + Never mix calls to PyMem_ with calls to the platform malloc/realloc/ + calloc/free. For example, on Windows different DLLs may end up using + different heaps, and if you use PyMem_Malloc you'll get the memory from the + heap used by the Python DLL; it could be a disaster if you free()'ed that + directly in your own extension. Using PyMem_Free instead ensures Python + can return the memory to the proper heap. As another example, in + a debug build (Py_DEBUG macro), Python wraps all calls to all PyMem_ and + PyObject_ memory functions in special debugging wrappers that add additional + debugging info to dynamic memory blocks. The system routines have no idea + what to do with that stuff, and the Python wrappers have no idea what to do + with raw blocks obtained directly by the system routines then. + + The GIL must be held when using these APIs. +*/ + +/* + * Raw memory interface + * ==================== + */ + +/* Functions + + Functions supplying platform-independent semantics for malloc/realloc/ + free. These functions make sure that allocating 0 bytes returns a distinct + non-NULL pointer (whenever possible -- if we're flat out of memory, NULL + may be returned), even if the platform malloc and realloc don't. + Returned pointers must be checked for NULL explicitly. No action is + performed on failure (no exception is set, no warning is printed, etc). +*/ + +PyAPI_FUNC(void *) PyMem_Malloc(size_t size); +PyAPI_FUNC(void *) PyMem_Calloc(size_t nelem, size_t elsize); +PyAPI_FUNC(void *) PyMem_Realloc(void *ptr, size_t new_size); +PyAPI_FUNC(void) PyMem_Free(void *ptr); + +/* + * Type-oriented memory interface + * ============================== + * + * Allocate memory for n objects of the given type. Returns a new pointer + * or NULL if the request was too large or memory allocation failed. 
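+ * For example (illustrative), "wchar_t *buf = PyMem_New(wchar_t, n);"
+ * requests room for n wide characters and evaluates to NULL, rather than
+ * to a too-small buffer, when n * sizeof(wchar_t) would overflow.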
Use
+ * these macros rather than doing the multiplication yourself so that proper
+ * overflow checking is always done.
+ */
+
+#define PyMem_New(type, n) \
+  ( ((size_t)(n) > PY_SSIZE_T_MAX / sizeof(type)) ? NULL : \
+        ( (type *) PyMem_Malloc((n) * sizeof(type)) ) )
+
+/*
+ * The value of (p) is always clobbered by this macro regardless of success.
+ * The caller MUST check if (p) is NULL afterwards and deal with the memory
+ * error if so. This means the original value of (p) MUST be saved for the
+ * caller's memory error handler to not lose track of it.
+ */
+#define PyMem_Resize(p, type, n) \
+  ( (p) = ((size_t)(n) > PY_SSIZE_T_MAX / sizeof(type)) ? NULL : \
+        (type *) PyMem_Realloc((p), (n) * sizeof(type)) )
+
+
+// Deprecated aliases only kept for backward compatibility.
+// PyMem_Del and PyMem_DEL are defined with no parameter to be able to use
+// them as function pointers (ex: dealloc = PyMem_Del).
+#define PyMem_MALLOC(n)           PyMem_Malloc((n))
+#define PyMem_NEW(type, n)        PyMem_New(type, (n))
+#define PyMem_REALLOC(p, n)       PyMem_Realloc((p), (n))
+#define PyMem_RESIZE(p, type, n)  PyMem_Resize((p), type, (n))
+#define PyMem_FREE(p)             PyMem_Free((p))
+#define PyMem_Del(p)              PyMem_Free((p))
+#define PyMem_DEL(p)              PyMem_Free((p))
+
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030d0000
+// Memory allocator which doesn't require the GIL to be held.
+// Usually, it's just a thin wrapper to functions of the standard C library:
+// malloc(), calloc(), realloc() and free(). The difference is that
+// tracemalloc can track these memory allocations.
+PyAPI_FUNC(void *) PyMem_RawMalloc(size_t size);
+PyAPI_FUNC(void *) PyMem_RawCalloc(size_t nelem, size_t elsize);
+PyAPI_FUNC(void *) PyMem_RawRealloc(void *ptr, size_t new_size);
+PyAPI_FUNC(void) PyMem_RawFree(void *ptr);
+#endif
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_PYMEM_H
+#  include "cpython/pymem.h"
+#  undef Py_CPYTHON_PYMEM_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif // !Py_PYMEM_H
diff --git a/Include/pyport.h b/Include/pyport.h
new file mode 100644
index 0000000000000000000000000000000000000000..72a157e679d92ff5c70db7b98fe0456cf59ec4ce
--- /dev/null
+++ b/Include/pyport.h
@@ -0,0 +1,633 @@
+#ifndef Py_PYPORT_H
+#define Py_PYPORT_H
+
+#ifndef UCHAR_MAX
+#  error "<limits.h> header must define UCHAR_MAX"
+#endif
+#if UCHAR_MAX != 255
+#  error "Python's source code assumes C's unsigned char is an 8-bit type"
+#endif
+
+
+// Macro to use C++ static_cast<> in the Python C API.
+#ifdef __cplusplus
+#  define _Py_STATIC_CAST(type, expr) static_cast<type>(expr)
+#else
+#  define _Py_STATIC_CAST(type, expr) ((type)(expr))
+#endif
+// Macro to use the more powerful/dangerous C-style cast even in C++.
+#define _Py_CAST(type, expr) ((type)(expr))
+
+// Static inline functions should use _Py_NULL rather than directly using NULL
+// to prevent C++ compiler warnings. On C23 and newer and on C++11 and newer,
+// _Py_NULL is defined as nullptr.
+#if (defined (__STDC_VERSION__) && __STDC_VERSION__ > 201710L) \
+        || (defined(__cplusplus) && __cplusplus >= 201103)
+#  define _Py_NULL nullptr
+#else
+#  define _Py_NULL NULL
+#endif
+
+
+/* Defines to build Python and its standard library:
+ *
+ * - Py_BUILD_CORE: Build Python core. Give access to Python internals, but
+ *   should not be used by third-party modules.
+ * - Py_BUILD_CORE_BUILTIN: Build a Python stdlib module as a built-in module.
+ * - Py_BUILD_CORE_MODULE: Build a Python stdlib module as a dynamic library.
+ *
+ * Py_BUILD_CORE_BUILTIN and Py_BUILD_CORE_MODULE imply Py_BUILD_CORE.
+ * + * On Windows, Py_BUILD_CORE_MODULE exports "PyInit_xxx" symbol, whereas + * Py_BUILD_CORE_BUILTIN does not. + */ +#if defined(Py_BUILD_CORE_BUILTIN) && !defined(Py_BUILD_CORE) +# define Py_BUILD_CORE +#endif +#if defined(Py_BUILD_CORE_MODULE) && !defined(Py_BUILD_CORE) +# define Py_BUILD_CORE +#endif + + +/************************************************************************** +Symbols and macros to supply platform-independent interfaces to basic +C language & library operations whose spellings vary across platforms. + +Please try to make documentation here as clear as possible: by definition, +the stuff here is trying to illuminate C's darkest corners. + +Config #defines referenced here: + +SIGNED_RIGHT_SHIFT_ZERO_FILLS +Meaning: To be defined iff i>>j does not extend the sign bit when i is a + signed integral type and i < 0. +Used in: Py_ARITHMETIC_RIGHT_SHIFT + +Py_DEBUG +Meaning: Extra checks compiled in for debug mode. +Used in: Py_SAFE_DOWNCAST + +**************************************************************************/ + +/* typedefs for some C9X-defined synonyms for integral types. + * + * The names in Python are exactly the same as the C9X names, except with a + * Py_ prefix. Until C9X is universally implemented, this is the only way + * to ensure that Python gets reliable names that don't conflict with names + * in non-Python code that are playing their own tricks to define the C9X + * names. + * + * NOTE: don't go nuts here! Python has no use for *most* of the C9X + * integral synonyms. Only define the ones we actually need. + */ + +/* long long is required. Ensure HAVE_LONG_LONG is defined for compatibility. */ +#ifndef HAVE_LONG_LONG +#define HAVE_LONG_LONG 1 +#endif +#ifndef PY_LONG_LONG +#define PY_LONG_LONG long long +/* If LLONG_MAX is defined in limits.h, use that. */ +#define PY_LLONG_MIN LLONG_MIN +#define PY_LLONG_MAX LLONG_MAX +#define PY_ULLONG_MAX ULLONG_MAX +#endif + +#define PY_UINT32_T uint32_t +#define PY_UINT64_T uint64_t + +/* Signed variants of the above */ +#define PY_INT32_T int32_t +#define PY_INT64_T int64_t + +/* PYLONG_BITS_IN_DIGIT describes the number of bits per "digit" (limb) in the + * PyLongObject implementation (longintrepr.h). It's currently either 30 or 15, + * defaulting to 30. The 15-bit digit option may be removed in the future. + */ +#ifndef PYLONG_BITS_IN_DIGIT +#define PYLONG_BITS_IN_DIGIT 30 +#endif + +/* uintptr_t is the C9X name for an unsigned integral type such that a + * legitimate void* can be cast to uintptr_t and then back to void* again + * without loss of information. Similarly for intptr_t, wrt a signed + * integral type. + */ +typedef uintptr_t Py_uintptr_t; +typedef intptr_t Py_intptr_t; + +/* Py_ssize_t is a signed integral type such that sizeof(Py_ssize_t) == + * sizeof(size_t). C99 doesn't define such a thing directly (size_t is an + * unsigned integral type). See PEP 353 for details. + * PY_SSIZE_T_MAX is the largest positive value of type Py_ssize_t. + */ +#ifdef HAVE_PY_SSIZE_T + +#elif HAVE_SSIZE_T +typedef ssize_t Py_ssize_t; +# define PY_SSIZE_T_MAX SSIZE_MAX +#elif SIZEOF_VOID_P == SIZEOF_SIZE_T +typedef Py_intptr_t Py_ssize_t; +# define PY_SSIZE_T_MAX INTPTR_MAX +#else +# error "Python needs a typedef for Py_ssize_t in pyport.h." +#endif + +/* Smallest negative value of type Py_ssize_t. */ +#define PY_SSIZE_T_MIN (-PY_SSIZE_T_MAX-1) + +/* Py_hash_t is the same size as a pointer. 
*/ +#define SIZEOF_PY_HASH_T SIZEOF_SIZE_T +typedef Py_ssize_t Py_hash_t; +/* Py_uhash_t is the unsigned equivalent needed to calculate numeric hash. */ +#define SIZEOF_PY_UHASH_T SIZEOF_SIZE_T +typedef size_t Py_uhash_t; + +/* Now PY_SSIZE_T_CLEAN is mandatory. This is just for backward compatibility. */ +typedef Py_ssize_t Py_ssize_clean_t; + +/* Largest possible value of size_t. */ +#define PY_SIZE_MAX SIZE_MAX + +/* Macro kept for backward compatibility: use directly "z" in new code. + * + * PY_FORMAT_SIZE_T is a modifier for use in a printf format to convert an + * argument with the width of a size_t or Py_ssize_t: "z" (C99). + */ +#ifndef PY_FORMAT_SIZE_T +# define PY_FORMAT_SIZE_T "z" +#endif + +/* Py_LOCAL can be used instead of static to get the fastest possible calling + * convention for functions that are local to a given module. + * + * Py_LOCAL_INLINE does the same thing, and also explicitly requests inlining, + * for platforms that support that. + * + * NOTE: You can only use this for functions that are entirely local to a + * module; functions that are exported via method tables, callbacks, etc, + * should keep using static. + */ + +#if defined(_MSC_VER) + /* ignore warnings if the compiler decides not to inline a function */ +# pragma warning(disable: 4710) + /* fastest possible local call under MSVC */ +# define Py_LOCAL(type) static type __fastcall +# define Py_LOCAL_INLINE(type) static __inline type __fastcall +#else +# define Py_LOCAL(type) static type +# define Py_LOCAL_INLINE(type) static inline type +#endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 +# define Py_MEMCPY memcpy +#endif + +#ifdef __cplusplus +/* Move this down here since some C++ #include's don't like to be included + inside an extern "C" */ +extern "C" { +#endif + + +/* Py_ARITHMETIC_RIGHT_SHIFT + * C doesn't define whether a right-shift of a signed integer sign-extends + * or zero-fills. Here a macro to force sign extension: + * Py_ARITHMETIC_RIGHT_SHIFT(TYPE, I, J) + * Return I >> J, forcing sign extension. Arithmetically, return the + * floor of I/2**J. + * Requirements: + * I should have signed integer type. In the terminology of C99, this can + * be either one of the five standard signed integer types (signed char, + * short, int, long, long long) or an extended signed integer type. + * J is an integer >= 0 and strictly less than the number of bits in the + * type of I (because C doesn't define what happens for J outside that + * range either). + * TYPE used to specify the type of I, but is now ignored. It's been left + * in for backwards compatibility with versions <= 2.6 or 3.0. + * Caution: + * I may be evaluated more than once. + */ +#ifdef SIGNED_RIGHT_SHIFT_ZERO_FILLS +#define Py_ARITHMETIC_RIGHT_SHIFT(TYPE, I, J) \ + ((I) < 0 ? -1-((-1-(I)) >> (J)) : (I) >> (J)) +#else +#define Py_ARITHMETIC_RIGHT_SHIFT(TYPE, I, J) ((I) >> (J)) +#endif + +/* Py_FORCE_EXPANSION(X) + * "Simply" returns its argument. However, macro expansions within the + * argument are evaluated. This unfortunate trickery is needed to get + * token-pasting to work as desired in some cases. + */ +#define Py_FORCE_EXPANSION(X) X + +/* Py_SAFE_DOWNCAST(VALUE, WIDE, NARROW) + * Cast VALUE to type NARROW from type WIDE. In Py_DEBUG mode, this + * assert-fails if any information is lost. + * Caution: + * VALUE may be evaluated more than once. 
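+ *
+ * Example:
+ *     Py_ssize_t size = ...;
+ *     int truncated = Py_SAFE_DOWNCAST(size, Py_ssize_t, int);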
+ */
+#ifdef Py_DEBUG
+#  define Py_SAFE_DOWNCAST(VALUE, WIDE, NARROW) \
+       (assert(_Py_STATIC_CAST(WIDE, _Py_STATIC_CAST(NARROW, (VALUE))) == (VALUE)), \
+        _Py_STATIC_CAST(NARROW, (VALUE)))
+#else
+#  define Py_SAFE_DOWNCAST(VALUE, WIDE, NARROW) _Py_STATIC_CAST(NARROW, (VALUE))
+#endif
+
+
+/* Py_DEPRECATED(version)
+ * Declare a variable, type, or function deprecated.
+ * The macro must be placed before the declaration.
+ * Usage:
+ *    Py_DEPRECATED(3.3) extern int old_var;
+ *    Py_DEPRECATED(3.4) typedef int T1;
+ *    Py_DEPRECATED(3.8) PyAPI_FUNC(int) Py_OldFunction(void);
+ */
+#if defined(__GNUC__) \
+    && ((__GNUC__ >= 4) || (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
+#define Py_DEPRECATED(VERSION_UNUSED) __attribute__((__deprecated__))
+#elif defined(_MSC_VER)
+#define Py_DEPRECATED(VERSION) __declspec(deprecated( \
+                                          "deprecated in " #VERSION))
+#else
+#define Py_DEPRECATED(VERSION_UNUSED)
+#endif
+
+// _Py_DEPRECATED_EXTERNALLY(version)
+// Deprecated outside CPython core.
+#ifdef Py_BUILD_CORE
+#define _Py_DEPRECATED_EXTERNALLY(VERSION_UNUSED)
+#else
+#define _Py_DEPRECATED_EXTERNALLY(version) Py_DEPRECATED(version)
+#endif
+
+
+#if defined(__clang__)
+#define _Py_COMP_DIAG_PUSH _Pragma("clang diagnostic push")
+#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS \
+    _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"")
+#define _Py_COMP_DIAG_POP _Pragma("clang diagnostic pop")
+#elif defined(__GNUC__) \
+    && ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 6))
+#define _Py_COMP_DIAG_PUSH _Pragma("GCC diagnostic push")
+#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS \
+    _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+#define _Py_COMP_DIAG_POP _Pragma("GCC diagnostic pop")
+#elif defined(_MSC_VER)
+#define _Py_COMP_DIAG_PUSH __pragma(warning(push))
+#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS __pragma(warning(disable: 4996))
+#define _Py_COMP_DIAG_POP __pragma(warning(pop))
+#else
+#define _Py_COMP_DIAG_PUSH
+#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS
+#define _Py_COMP_DIAG_POP
+#endif
+
+/* _Py_HOT_FUNCTION
+ * The hot attribute on a function is used to inform the compiler that the
+ * function is a hot spot of the compiled program. The function is optimized
+ * more aggressively and on many targets it is placed into a special subsection
+ * of the text section so all hot functions appear close together, improving
+ * locality.
+ *
+ * Usage:
+ *    int _Py_HOT_FUNCTION x(void) { return 3; }
+ *
+ * Issue #28618: This attribute must not be abused, otherwise it can have a
+ * negative effect on performance. Only the functions where Python spends most
+ * of its time should use it. Use a profiler when running the performance
+ * benchmark suite to find these functions.
+ */
+#if defined(__GNUC__) \
+    && ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 3))
+#define _Py_HOT_FUNCTION __attribute__((hot))
+#else
+#define _Py_HOT_FUNCTION
+#endif
+
+// Ask the compiler to always inline a static inline function. The compiler can
+// ignore it and decide not to inline the function.
+//
+// It can be used to inline performance-critical static inline functions when
+// building Python in debug mode with function inlining disabled. For example,
+// MSC disables function inlining when building in debug mode.
+//
+// Blindly marking a static inline function with Py_ALWAYS_INLINE can result in
+// worse performance (due to increased code size, for example). The compiler is
+// usually smarter than the developer for the cost/benefit analysis.
+// +// If Python is built in debug mode (if the Py_DEBUG macro is defined), the +// Py_ALWAYS_INLINE macro does nothing. +// +// It must be specified before the function return type. Usage: +// +// static inline Py_ALWAYS_INLINE int random(void) { return 4; } +#if defined(Py_DEBUG) + // If Python is built in debug mode, usually compiler optimizations are + // disabled. In this case, Py_ALWAYS_INLINE can increase a lot the stack + // memory usage. For example, forcing inlining using gcc -O0 increases the + // stack usage from 6 KB to 15 KB per Python function call. +# define Py_ALWAYS_INLINE +#elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) +# define Py_ALWAYS_INLINE __attribute__((always_inline)) +#elif defined(_MSC_VER) +# define Py_ALWAYS_INLINE __forceinline +#else +# define Py_ALWAYS_INLINE +#endif + +// Py_NO_INLINE +// Disable inlining on a function. For example, it reduces the C stack +// consumption: useful on LTO+PGO builds which heavily inline code (see +// bpo-33720). +// +// Usage: +// +// Py_NO_INLINE static int random(void) { return 4; } +#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) +# define Py_NO_INLINE __attribute__ ((noinline)) +#elif defined(_MSC_VER) +# define Py_NO_INLINE __declspec(noinline) +#else +# define Py_NO_INLINE +#endif + +#include "exports.h" + +#ifdef Py_LIMITED_API + // The internal C API must not be used with the limited C API: make sure + // that Py_BUILD_CORE macro is not defined in this case. These 3 macros are + // used by exports.h, so only undefine them afterwards. +# undef Py_BUILD_CORE +# undef Py_BUILD_CORE_BUILTIN +# undef Py_BUILD_CORE_MODULE +#endif + +/* limits.h constants that may be missing */ + +#ifndef INT_MAX +#define INT_MAX 2147483647 +#endif + +#ifndef LONG_MAX +#if SIZEOF_LONG == 4 +#define LONG_MAX 0X7FFFFFFFL +#elif SIZEOF_LONG == 8 +#define LONG_MAX 0X7FFFFFFFFFFFFFFFL +#else +#error "could not set LONG_MAX in pyport.h" +#endif +#endif + +#ifndef LONG_MIN +#define LONG_MIN (-LONG_MAX-1) +#endif + +#ifndef LONG_BIT +#define LONG_BIT (8 * SIZEOF_LONG) +#endif + +#if LONG_BIT != 8 * SIZEOF_LONG +/* 04-Oct-2000 LONG_BIT is apparently (mis)defined as 64 on some recent + * 32-bit platforms using gcc. We try to catch that here at compile-time + * rather than waiting for integer multiplication to trigger bogus + * overflows. + */ +#error "LONG_BIT definition appears wrong for platform (bad gcc/glibc config?)." +#endif + +#ifdef __cplusplus +} +#endif + +/* + * Hide GCC attributes from compilers that don't support them. + */ +#if (!defined(__GNUC__) || __GNUC__ < 2 || \ + (__GNUC__ == 2 && __GNUC_MINOR__ < 7) ) +#define Py_GCC_ATTRIBUTE(x) +#else +#define Py_GCC_ATTRIBUTE(x) __attribute__(x) +#endif + +/* + * Specify alignment on compilers that support it. + */ +#if defined(__GNUC__) && __GNUC__ >= 3 +#define Py_ALIGNED(x) __attribute__((aligned(x))) +#else +#define Py_ALIGNED(x) +#endif + +/* Eliminate end-of-loop code not reached warnings from SunPro C + * when using do{...}while(0) macros + */ +#ifdef __SUNPRO_C +#pragma error_messages (off,E_END_OF_LOOP_CODE_NOT_REACHED) +#endif + +#ifndef Py_LL +#define Py_LL(x) x##LL +#endif + +#ifndef Py_ULL +#define Py_ULL(x) Py_LL(x##U) +#endif + +#define Py_VA_COPY va_copy + +/* + * Convenient macros to deal with endianness of the platform. WORDS_BIGENDIAN is + * detected by configure and defined in pyconfig.h. The code in pyconfig.h + * also takes care of Apple's universal builds. 
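+ *
+ * For example, byte-order-specific code can be selected with:
+ *
+ *     #if PY_LITTLE_ENDIAN
+ *     ...little-endian fast path...
+ *     #else
+ *     ...big-endian path...
+ *     #endif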
+ */ + +#ifdef WORDS_BIGENDIAN +# define PY_BIG_ENDIAN 1 +# define PY_LITTLE_ENDIAN 0 +#else +# define PY_BIG_ENDIAN 0 +# define PY_LITTLE_ENDIAN 1 +#endif + +#ifdef __ANDROID__ + /* The Android langinfo.h header is not used. */ +# undef HAVE_LANGINFO_H +# undef CODESET +#endif + +/* Maximum value of the Windows DWORD type */ +#define PY_DWORD_MAX 4294967295U + +/* This macro used to tell whether Python was built with multithreading + * enabled. Now multithreading is always enabled, but keep the macro + * for compatibility. + */ +#ifndef WITH_THREAD +# define WITH_THREAD +#endif + +/* Some WebAssembly platforms do not provide a working pthread implementation. + * Thread support is stubbed and any attempt to create a new thread fails. + */ +#if (!defined(HAVE_PTHREAD_STUBS) && \ + (!defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__))) +# define Py_CAN_START_THREADS 1 +#endif + +#ifdef WITH_THREAD +# ifdef Py_BUILD_CORE +# ifdef HAVE_THREAD_LOCAL +# error "HAVE_THREAD_LOCAL is already defined" +# endif +# define HAVE_THREAD_LOCAL 1 +# ifdef thread_local +# define _Py_thread_local thread_local +# elif __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) +# define _Py_thread_local _Thread_local +# elif defined(_MSC_VER) /* AKA NT_THREADS */ +# define _Py_thread_local __declspec(thread) +# elif defined(__GNUC__) /* includes clang */ +# define _Py_thread_local __thread +# else + // fall back to the PyThread_tss_*() API, or ignore. +# undef HAVE_THREAD_LOCAL +# endif +# endif +#endif + +#if defined(__ANDROID__) || defined(__VXWORKS__) + // Use UTF-8 as the locale encoding, ignore the LC_CTYPE locale. + // See _Py_GetLocaleEncoding(), PyUnicode_DecodeLocale() + // and PyUnicode_EncodeLocale(). +# define _Py_FORCE_UTF8_LOCALE +#endif + +#if defined(_Py_FORCE_UTF8_LOCALE) || defined(__APPLE__) + // Use UTF-8 as the filesystem encoding. + // See PyUnicode_DecodeFSDefaultAndSize(), PyUnicode_EncodeFSDefault(), + // Py_DecodeLocale() and Py_EncodeLocale(). +# define _Py_FORCE_UTF8_FS_ENCODING +#endif + +/* Mark a function which cannot return. Example: + PyAPI_FUNC(void) _Py_NO_RETURN PyThread_exit_thread(void); + + XLC support is intentionally omitted due to bpo-40244 */ +#ifndef _Py_NO_RETURN +#if defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ >= 3) || \ + (__GNUC__ == 2) && (__GNUC_MINOR__ >= 5))) +# define _Py_NO_RETURN __attribute__((__noreturn__)) +#elif defined(_MSC_VER) +# define _Py_NO_RETURN __declspec(noreturn) +#else +# define _Py_NO_RETURN +#endif +#endif + + +// Preprocessor check for a builtin preprocessor function. Always return 0 +// if __has_builtin() macro is not defined. +// +// __has_builtin() is available on clang and GCC 10. +#ifdef __has_builtin +# define _Py__has_builtin(x) __has_builtin(x) +#else +# define _Py__has_builtin(x) 0 +#endif + +// Preprocessor check for a compiler __attribute__. Always return 0 +// if __has_attribute() macro is not defined. +#ifdef __has_attribute +# define _Py__has_attribute(x) __has_attribute(x) +#else +# define _Py__has_attribute(x) 0 +#endif + +// _Py_TYPEOF(expr) gets the type of an expression. +// +// Example: _Py_TYPEOF(x) x_copy = (x); +// +// The macro is only defined if GCC or clang compiler is used. +#if defined(__GNUC__) || defined(__clang__) +# define _Py_TYPEOF(expr) __typeof__(expr) +#endif + + +/* A convenient way for code to know if sanitizers are enabled. 
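+   For example, a routine that deliberately reads memory a sanitizer would
+   flag can be compiled out under #ifdef _Py_MEMORY_SANITIZER.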
*/
+#if defined(__has_feature)
+#  if __has_feature(memory_sanitizer)
+#    if !defined(_Py_MEMORY_SANITIZER)
+#      define _Py_MEMORY_SANITIZER
+#    endif
+#  endif
+#  if __has_feature(address_sanitizer)
+#    if !defined(_Py_ADDRESS_SANITIZER)
+#      define _Py_ADDRESS_SANITIZER
+#    endif
+#  endif
+#  if __has_feature(thread_sanitizer)
+#    if !defined(_Py_THREAD_SANITIZER)
+#      define _Py_THREAD_SANITIZER
+#    endif
+#  endif
+#elif defined(__GNUC__)
+#  if defined(__SANITIZE_ADDRESS__)
+#    define _Py_ADDRESS_SANITIZER
+#  endif
+#  if defined(__SANITIZE_THREAD__)
+#    define _Py_THREAD_SANITIZER
+#  endif
+#endif
+
+
+/* AIX has __bool__ redefined in its system header file. */
+#if defined(_AIX) && defined(__bool__)
+#undef __bool__
+#endif
+
+// Make sure we have maximum alignment, even if the current compiler
+// does not support max_align_t. Note that:
+// - Autoconf reports the alignment of unknown types as 0.
+// - 'long double' has maximum alignment on *most* platforms, which
+//   looks like the best we can do for pre-C11 compilers.
+// - The value is tested, see test_alignof_max_align_t
+#if !defined(ALIGNOF_MAX_ALIGN_T) || ALIGNOF_MAX_ALIGN_T == 0
+#  undef ALIGNOF_MAX_ALIGN_T
+#  define ALIGNOF_MAX_ALIGN_T _Alignof(long double)
+#endif
+
+#ifndef PY_CXX_CONST
+#  ifdef __cplusplus
+#    define PY_CXX_CONST const
+#  else
+#    define PY_CXX_CONST
+#  endif
+#endif
+
+#if defined(__sgi) && !defined(_SGI_MP_SOURCE)
+#  define _SGI_MP_SOURCE
+#endif
+
+
+// _Py_NONSTRING: The nonstring variable attribute specifies that an object or
+// member declaration with type array of char, signed char, or unsigned char,
+// or pointer to such a type is intended to store character arrays that do not
+// necessarily contain a terminating NUL.
+//
+// Usage:
+//
+//    char name [8] _Py_NONSTRING;
+#if _Py__has_attribute(nonstring)
+#  define _Py_NONSTRING __attribute__((nonstring))
+#else
+#  define _Py_NONSTRING
+#endif
+
+
+#endif /* Py_PYPORT_H */
diff --git a/Include/pystate.h b/Include/pystate.h
new file mode 100644
index 0000000000000000000000000000000000000000..727b8fbfffe0e674c48e3b0594dc5c54f2221f97
--- /dev/null
+++ b/Include/pystate.h
@@ -0,0 +1,132 @@
+/* Thread and interpreter state structures and their interfaces */
+
+
+#ifndef Py_PYSTATE_H
+#define Py_PYSTATE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This limitation is for performance and simplicity. If needed it can be
+removed (with effort). */
+#define MAX_CO_EXTRA_USERS 255
+
+PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_New(void);
+PyAPI_FUNC(void) PyInterpreterState_Clear(PyInterpreterState *);
+PyAPI_FUNC(void) PyInterpreterState_Delete(PyInterpreterState *);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000
+/* New in 3.9 */
+/* Get the current interpreter state.
+
+   Issue a fatal error if there is no current Python thread state or no
+   current interpreter. It cannot return NULL.
+
+   The caller must hold the GIL.
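+
+   For example (an illustrative sketch), per-interpreter data can be kept in
+   the dict returned by PyInterpreterState_GetDict(PyInterpreterState_Get());
+   PyInterpreterState_GetDict() is declared just below.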
*/
+PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_Get(void);
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
+/* New in 3.8 */
+PyAPI_FUNC(PyObject *) PyInterpreterState_GetDict(PyInterpreterState *);
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000
+/* New in 3.7 */
+PyAPI_FUNC(int64_t) PyInterpreterState_GetID(PyInterpreterState *);
+#endif
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+
+/* State unique per thread */
+
+/* New in 3.3 */
+PyAPI_FUNC(int) PyState_AddModule(PyObject*, PyModuleDef*);
+PyAPI_FUNC(int) PyState_RemoveModule(PyModuleDef*);
+#endif
+PyAPI_FUNC(PyObject*) PyState_FindModule(PyModuleDef*);
+
+PyAPI_FUNC(PyThreadState *) PyThreadState_New(PyInterpreterState *);
+PyAPI_FUNC(void) PyThreadState_Clear(PyThreadState *);
+PyAPI_FUNC(void) PyThreadState_Delete(PyThreadState *);
+
+/* Get the current thread state.
+
+   When the current thread state is NULL, this issues a fatal error (so that
+   the caller needn't check for NULL).
+
+   The caller must hold the GIL.
+
+   See also PyThreadState_GetUnchecked() and _PyThreadState_GET(). */
+PyAPI_FUNC(PyThreadState *) PyThreadState_Get(void);
+
+// Alias to PyThreadState_Get()
+#define PyThreadState_GET() PyThreadState_Get()
+
+PyAPI_FUNC(PyThreadState *) PyThreadState_Swap(PyThreadState *);
+PyAPI_FUNC(PyObject *) PyThreadState_GetDict(void);
+PyAPI_FUNC(int) PyThreadState_SetAsyncExc(unsigned long, PyObject *);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000
+/* New in 3.9 */
+PyAPI_FUNC(PyInterpreterState*) PyThreadState_GetInterpreter(PyThreadState *tstate);
+PyAPI_FUNC(PyFrameObject*) PyThreadState_GetFrame(PyThreadState *tstate);
+PyAPI_FUNC(uint64_t) PyThreadState_GetID(PyThreadState *tstate);
+#endif
+
+typedef
+    enum {PyGILState_LOCKED, PyGILState_UNLOCKED}
+        PyGILState_STATE;
+
+
+/* Ensure that the current thread is ready to call the Python
+   C API, regardless of the current state of Python, or of its
+   thread lock. This may be called as many times as desired
+   by a thread so long as each call is matched with a call to
+   PyGILState_Release(). In general, other thread-state APIs may
+   be used between _Ensure() and _Release() calls, so long as the
+   thread-state is restored to its previous state before the Release().
+   For example, normal use of the Py_BEGIN_ALLOW_THREADS/
+   Py_END_ALLOW_THREADS macros is acceptable.
+
+   The return value is an opaque "handle" to the thread state when
+   PyGILState_Ensure() was called, and must be passed to
+   PyGILState_Release() to ensure Python is left in the same state. Even
+   though recursive calls are allowed, these handles can *not* be shared -
+   each unique call to PyGILState_Ensure must save the handle for its
+   call to PyGILState_Release.
+
+   When the function returns, the current thread will hold the GIL.
+
+   Failure is a fatal error.
+*/
+PyAPI_FUNC(PyGILState_STATE) PyGILState_Ensure(void);
+
+/* Release any resources previously acquired. After this call, Python's
+   state will be the same as it was prior to the corresponding
+   PyGILState_Ensure() call (but generally this state will be unknown to
+   the caller, hence the use of the GILState API.)
+
+   Every call to PyGILState_Ensure must be matched by a call to
+   PyGILState_Release on the same thread.
+*/
+PyAPI_FUNC(void) PyGILState_Release(PyGILState_STATE);
+
+/* Helper/diagnostic function - get the current thread state for
+   this thread. May return NULL if no GILState API has been used
+   on the current thread.
Note that the main thread always has such a + thread-state, even if no auto-thread-state call has been made + on the main thread. +*/ +PyAPI_FUNC(PyThreadState *) PyGILState_GetThisThreadState(void); + + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_PYSTATE_H +# include "cpython/pystate.h" +# undef Py_CPYTHON_PYSTATE_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_PYSTATE_H */ diff --git a/Include/pystats.h b/Include/pystats.h new file mode 100644 index 0000000000000000000000000000000000000000..acfa32201711e07b26188e62f78e1e58217dad09 --- /dev/null +++ b/Include/pystats.h @@ -0,0 +1,26 @@ +// Statistics on Python performance (public API). +// +// Define _Py_INCREF_STAT_INC() and _Py_DECREF_STAT_INC() used by Py_INCREF() +// and Py_DECREF(). +// +// See Include/cpython/pystats.h for the full API. + +#ifndef Py_PYSTATS_H +#define Py_PYSTATS_H +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(Py_STATS) && !defined(Py_LIMITED_API) +# define Py_CPYTHON_PYSTATS_H +# include "cpython/pystats.h" +# undef Py_CPYTHON_PYSTATS_H +#else +# define _Py_INCREF_STAT_INC() ((void)0) +# define _Py_DECREF_STAT_INC() ((void)0) +#endif // !Py_STATS + +#ifdef __cplusplus +} +#endif +#endif // !Py_PYSTATS_H diff --git a/Include/pystrcmp.h b/Include/pystrcmp.h new file mode 100644 index 0000000000000000000000000000000000000000..edb12397e3cbcc761a5ba28221215a62193a48ba --- /dev/null +++ b/Include/pystrcmp.h @@ -0,0 +1,23 @@ +#ifndef Py_STRCMP_H +#define Py_STRCMP_H + +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_FUNC(int) PyOS_mystrnicmp(const char *, const char *, Py_ssize_t); +PyAPI_FUNC(int) PyOS_mystricmp(const char *, const char *); + +#ifdef MS_WINDOWS +#define PyOS_strnicmp strnicmp +#define PyOS_stricmp stricmp +#else +#define PyOS_strnicmp PyOS_mystrnicmp +#define PyOS_stricmp PyOS_mystricmp +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* !Py_STRCMP_H */ diff --git a/Include/pystrtod.h b/Include/pystrtod.h new file mode 100644 index 0000000000000000000000000000000000000000..e83d245eb623afabe3a5843e18bc0f6e2e5ba1b1 --- /dev/null +++ b/Include/pystrtod.h @@ -0,0 +1,37 @@ +#ifndef Py_STRTOD_H +#define Py_STRTOD_H + +#ifdef __cplusplus +extern "C" { +#endif + + +PyAPI_FUNC(double) PyOS_string_to_double(const char *str, + char **endptr, + PyObject *overflow_exception); + +/* The caller is responsible for calling PyMem_Free to free the buffer + that is returned. */ +PyAPI_FUNC(char *) PyOS_double_to_string(double val, + char format_code, + int precision, + int flags, + int *type); + +/* PyOS_double_to_string's "flags" parameter can be set to 0 or more of: */ +#define Py_DTSF_SIGN 0x01 /* always add the sign */ +#define Py_DTSF_ADD_DOT_0 0x02 /* if the result is an integer add ".0" */ +#define Py_DTSF_ALT 0x04 /* "alternate" formatting.
it's format_code + specific */ +#define Py_DTSF_NO_NEG_0 0x08 /* negative zero result is coerced to 0 */ + +/* PyOS_double_to_string's "type", if non-NULL, will be set to one of: */ +#define Py_DTST_FINITE 0 +#define Py_DTST_INFINITE 1 +#define Py_DTST_NAN 2 + +#ifdef __cplusplus +} +#endif + +#endif /* !Py_STRTOD_H */ diff --git a/Include/pythonrun.h b/Include/pythonrun.h new file mode 100644 index 0000000000000000000000000000000000000000..154c7450cb934f9492b1f2957f8a55545d27fdf4 --- /dev/null +++ b/Include/pythonrun.h @@ -0,0 +1,49 @@ + +/* Interfaces to parse and execute pieces of python code */ + +#ifndef Py_PYTHONRUN_H +#define Py_PYTHONRUN_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_FUNC(PyObject *) Py_CompileString(const char *, const char *, int); + +PyAPI_FUNC(void) PyErr_Print(void); +PyAPI_FUNC(void) PyErr_PrintEx(int); +PyAPI_FUNC(void) PyErr_Display(PyObject *, PyObject *, PyObject *); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030C0000 +PyAPI_FUNC(void) PyErr_DisplayException(PyObject *); +#endif + + +/* Stuff with no proper home (yet) */ +PyAPI_DATA(int) (*PyOS_InputHook)(void); + +/* Stack size, in "pointers" (so we get extra safety margins + on 64-bit platforms). On a 32-bit platform, this translates + to an 8k margin. */ +#define PYOS_STACK_MARGIN 2048 + +#if defined(WIN32) && !defined(MS_WIN64) && !defined(_M_ARM) && defined(_MSC_VER) && _MSC_VER >= 1300 +/* Enable stack checking under Microsoft C */ +// When changing the platforms, ensure PyOS_CheckStack() docs are still correct +#define USE_STACKCHECK +#endif + +#ifdef USE_STACKCHECK +/* Check that we aren't overflowing our stack */ +PyAPI_FUNC(int) PyOS_CheckStack(void); +#endif + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_PYTHONRUN_H +# include "cpython/pythonrun.h" +# undef Py_CPYTHON_PYTHONRUN_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_PYTHONRUN_H */ diff --git a/Include/pythread.h b/Include/pythread.h new file mode 100644 index 0000000000000000000000000000000000000000..a3216c51d66165902b000a0fba0497f701a543d4 --- /dev/null +++ b/Include/pythread.h @@ -0,0 +1,112 @@ +#ifndef Py_PYTHREAD_H +#define Py_PYTHREAD_H + +typedef void *PyThread_type_lock; + +#ifdef __cplusplus +extern "C" { +#endif + +/* Return status codes for Python lock acquisition. Chosen for maximum + * backwards compatibility, ie failure -> 0, success -> 1. */ +typedef enum PyLockStatus { + PY_LOCK_FAILURE = 0, + PY_LOCK_ACQUIRED = 1, + PY_LOCK_INTR +} PyLockStatus; + +PyAPI_FUNC(void) PyThread_init_thread(void); +PyAPI_FUNC(unsigned long) PyThread_start_new_thread(void (*)(void *), void *); +PyAPI_FUNC(void) _Py_NO_RETURN PyThread_exit_thread(void); +PyAPI_FUNC(unsigned long) PyThread_get_thread_ident(void); + +#if (defined(__APPLE__) || defined(__linux__) || defined(_WIN32) \ + || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) \ + || defined(__OpenBSD__) || defined(__NetBSD__) \ + || defined(__DragonFly__) || defined(_AIX)) +#define PY_HAVE_THREAD_NATIVE_ID +PyAPI_FUNC(unsigned long) PyThread_get_thread_native_id(void); +#endif + +PyAPI_FUNC(PyThread_type_lock) PyThread_allocate_lock(void); +PyAPI_FUNC(void) PyThread_free_lock(PyThread_type_lock); +PyAPI_FUNC(int) PyThread_acquire_lock(PyThread_type_lock, int); +#define WAIT_LOCK 1 +#define NOWAIT_LOCK 0 + +// PY_TIMEOUT_T is the integral type used to specify timeouts when waiting +// on a lock (see PyThread_acquire_lock_timed() below). 
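 +/* Illustrative sketch (not part of the original header): a timed acquire + with a 500 ms timeout, error checking omitted and assuming the GIL is + released around the wait: + + PyThread_type_lock lock = PyThread_allocate_lock(); + PyLockStatus st = PyThread_acquire_lock_timed(lock, 500000, 0); + if (st == PY_LOCK_ACQUIRED) { + PyThread_release_lock(lock); + } + PyThread_free_lock(lock); + + The final 0 is intr_flag, so the wait cannot be interrupted by signals; + see the full contract below. */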
+#define PY_TIMEOUT_T long long + + +/* If microseconds == 0, the call is non-blocking: it returns immediately + even when the lock can't be acquired. + If microseconds > 0, the call waits up to the specified duration. + If microseconds < 0, the call waits until success (or abnormal failure) + + If *microseconds* is greater than PY_TIMEOUT_MAX, clamp the timeout to + PY_TIMEOUT_MAX microseconds. + + If intr_flag is true and the acquire is interrupted by a signal, then the + call will return PY_LOCK_INTR. The caller may reattempt to acquire the + lock. +*/ +PyAPI_FUNC(PyLockStatus) PyThread_acquire_lock_timed(PyThread_type_lock, + PY_TIMEOUT_T microseconds, + int intr_flag); + +PyAPI_FUNC(void) PyThread_release_lock(PyThread_type_lock); + +PyAPI_FUNC(size_t) PyThread_get_stacksize(void); +PyAPI_FUNC(int) PyThread_set_stacksize(size_t); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(PyObject*) PyThread_GetInfo(void); +#endif + + +/* Thread Local Storage (TLS) API + TLS API is DEPRECATED. Use Thread Specific Storage (TSS) API. + + The existing TLS API has used int to represent TLS keys across all + platforms, but it is not POSIX-compliant. Therefore, the new TSS API uses + opaque data type to represent TSS keys to be compatible (see PEP 539). +*/ +Py_DEPRECATED(3.7) PyAPI_FUNC(int) PyThread_create_key(void); +Py_DEPRECATED(3.7) PyAPI_FUNC(void) PyThread_delete_key(int key); +Py_DEPRECATED(3.7) PyAPI_FUNC(int) PyThread_set_key_value(int key, + void *value); +Py_DEPRECATED(3.7) PyAPI_FUNC(void *) PyThread_get_key_value(int key); +Py_DEPRECATED(3.7) PyAPI_FUNC(void) PyThread_delete_key_value(int key); + +/* Cleanup after a fork */ +Py_DEPRECATED(3.7) PyAPI_FUNC(void) PyThread_ReInitTLS(void); + + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000 +/* New in 3.7 */ +/* Thread Specific Storage (TSS) API */ + +typedef struct _Py_tss_t Py_tss_t; /* opaque */ + +PyAPI_FUNC(Py_tss_t *) PyThread_tss_alloc(void); +PyAPI_FUNC(void) PyThread_tss_free(Py_tss_t *key); + +/* The parameter key must not be NULL. */ +PyAPI_FUNC(int) PyThread_tss_is_created(Py_tss_t *key); +PyAPI_FUNC(int) PyThread_tss_create(Py_tss_t *key); +PyAPI_FUNC(void) PyThread_tss_delete(Py_tss_t *key); +PyAPI_FUNC(int) PyThread_tss_set(Py_tss_t *key, void *value); +PyAPI_FUNC(void *) PyThread_tss_get(Py_tss_t *key); +#endif /* New in 3.7 */ + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_PYTHREAD_H +# include "cpython/pythread.h" +# undef Py_CPYTHON_PYTHREAD_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_PYTHREAD_H */ diff --git a/Include/pytypedefs.h b/Include/pytypedefs.h new file mode 100644 index 0000000000000000000000000000000000000000..e78ed56a3b67cd19a6fbce550cb9b145c719ed31 --- /dev/null +++ b/Include/pytypedefs.h @@ -0,0 +1,30 @@ +// Forward declarations of types of the Python C API. +// Declare them at the same place since redefining typedef is a C11 feature. +// Only use a forward declaration if there is an interdependency between two +// header files. 
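 +// Illustrative (not part of the original header; lookup_name is a +// hypothetical function): a forward declaration is enough for any +// pointer-based API, e.g. +// +// PyObject *lookup_name(PyThreadState *tstate, PyObject *name); +// +// compiles against these typedefs without the full struct definitions.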
+ +#ifndef Py_PYTYPEDEFS_H +#define Py_PYTYPEDEFS_H +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PyModuleDef PyModuleDef; +typedef struct PyModuleDef_Slot PyModuleDef_Slot; +typedef struct PyMethodDef PyMethodDef; +typedef struct PyGetSetDef PyGetSetDef; +typedef struct PyMemberDef PyMemberDef; + +typedef struct _object PyObject; +typedef struct _longobject PyLongObject; +typedef struct _typeobject PyTypeObject; +typedef struct PyCodeObject PyCodeObject; +typedef struct _frame PyFrameObject; + +typedef struct _ts PyThreadState; +typedef struct _is PyInterpreterState; + +#ifdef __cplusplus +} +#endif +#endif // !Py_PYTYPEDEFS_H diff --git a/Include/rangeobject.h b/Include/rangeobject.h new file mode 100644 index 0000000000000000000000000000000000000000..d46ce7cd41b7417f877a5277c32bb3ef033c715a --- /dev/null +++ b/Include/rangeobject.h @@ -0,0 +1,27 @@ + +/* Range object interface */ + +#ifndef Py_RANGEOBJECT_H +#define Py_RANGEOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +/* +A range object represents an integer range. This is an immutable object; +a range cannot change its value after creation. + +Range objects behave like the corresponding tuple objects except that +they are represented by start, stop, and step data members. +*/ + +PyAPI_DATA(PyTypeObject) PyRange_Type; +PyAPI_DATA(PyTypeObject) PyRangeIter_Type; +PyAPI_DATA(PyTypeObject) PyLongRangeIter_Type; + +#define PyRange_Check(op) Py_IS_TYPE((op), &PyRange_Type) + +#ifdef __cplusplus +} +#endif +#endif /* !Py_RANGEOBJECT_H */ diff --git a/Include/setobject.h b/Include/setobject.h new file mode 100644 index 0000000000000000000000000000000000000000..62c9e6b13f89015c02c09463612c3e608e485d40 --- /dev/null +++ b/Include/setobject.h @@ -0,0 +1,49 @@ +/* Set object interface */ + +#ifndef Py_SETOBJECT_H +#define Py_SETOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_DATA(PyTypeObject) PySet_Type; +PyAPI_DATA(PyTypeObject) PyFrozenSet_Type; +PyAPI_DATA(PyTypeObject) PySetIter_Type; + +PyAPI_FUNC(PyObject *) PySet_New(PyObject *); +PyAPI_FUNC(PyObject *) PyFrozenSet_New(PyObject *); + +PyAPI_FUNC(int) PySet_Add(PyObject *set, PyObject *key); +PyAPI_FUNC(int) PySet_Clear(PyObject *set); +PyAPI_FUNC(int) PySet_Contains(PyObject *anyset, PyObject *key); +PyAPI_FUNC(int) PySet_Discard(PyObject *set, PyObject *key); +PyAPI_FUNC(PyObject *) PySet_Pop(PyObject *set); +PyAPI_FUNC(Py_ssize_t) PySet_Size(PyObject *anyset); + +#define PyFrozenSet_CheckExact(ob) Py_IS_TYPE((ob), &PyFrozenSet_Type) +#define PyFrozenSet_Check(ob) \ (Py_IS_TYPE((ob), &PyFrozenSet_Type) || \ PyType_IsSubtype(Py_TYPE(ob), &PyFrozenSet_Type)) + +#define PyAnySet_CheckExact(ob) \ (Py_IS_TYPE((ob), &PySet_Type) || Py_IS_TYPE((ob), &PyFrozenSet_Type)) +#define PyAnySet_Check(ob) \ (Py_IS_TYPE((ob), &PySet_Type) || Py_IS_TYPE((ob), &PyFrozenSet_Type) || \ PyType_IsSubtype(Py_TYPE(ob), &PySet_Type) || \ PyType_IsSubtype(Py_TYPE(ob), &PyFrozenSet_Type)) + +#define PySet_CheckExact(op) Py_IS_TYPE(op, &PySet_Type) +#define PySet_Check(ob) \ (Py_IS_TYPE((ob), &PySet_Type) || \ PyType_IsSubtype(Py_TYPE(ob), &PySet_Type)) + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_SETOBJECT_H +# include "cpython/setobject.h" +# undef Py_CPYTHON_SETOBJECT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_SETOBJECT_H */ diff --git a/Include/sliceobject.h b/Include/sliceobject.h new file mode 100644 index 0000000000000000000000000000000000000000..35e2ea254ca80a4747c873ec00e1ae3b7dd01008 --- /dev/null +++ b/Include/sliceobject.h @@ -0,0 +1,69
@@ +#ifndef Py_SLICEOBJECT_H +#define Py_SLICEOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +/* The unique ellipsis object "..." */ + +PyAPI_DATA(PyObject) _Py_EllipsisObject; /* Don't use this directly */ + +#if defined(Py_LIMITED_API) && Py_LIMITED_API+0 >= 0x030D0000 +# define Py_Ellipsis Py_GetConstantBorrowed(Py_CONSTANT_ELLIPSIS) +#else +# define Py_Ellipsis (&_Py_EllipsisObject) +#endif + +/* Slice object interface */ + +/* + +A slice object containing start, stop, and step data members (the +names are from range). After much talk with Guido, it was decided to +let these be any arbitrary Python type. Py_None stands for omitted values. +*/ +#ifndef Py_LIMITED_API +typedef struct { + PyObject_HEAD + PyObject *start, *stop, *step; /* not NULL */ +} PySliceObject; +#endif + +PyAPI_DATA(PyTypeObject) PySlice_Type; +PyAPI_DATA(PyTypeObject) PyEllipsis_Type; + +#define PySlice_Check(op) Py_IS_TYPE((op), &PySlice_Type) + +PyAPI_FUNC(PyObject *) PySlice_New(PyObject* start, PyObject* stop, + PyObject* step); +#ifndef Py_LIMITED_API +PyAPI_FUNC(PyObject *) _PySlice_FromIndices(Py_ssize_t start, Py_ssize_t stop); +PyAPI_FUNC(int) _PySlice_GetLongIndices(PySliceObject *self, PyObject *length, + PyObject **start_ptr, PyObject **stop_ptr, + PyObject **step_ptr); +#endif +PyAPI_FUNC(int) PySlice_GetIndices(PyObject *r, Py_ssize_t length, + Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t *step); +Py_DEPRECATED(3.7) +PyAPI_FUNC(int) PySlice_GetIndicesEx(PyObject *r, Py_ssize_t length, + Py_ssize_t *start, Py_ssize_t *stop, + Py_ssize_t *step, + Py_ssize_t *slicelength); + +#if !defined(Py_LIMITED_API) || (Py_LIMITED_API+0 >= 0x03050400 && Py_LIMITED_API+0 < 0x03060000) || Py_LIMITED_API+0 >= 0x03060100 +#define PySlice_GetIndicesEx(slice, length, start, stop, step, slicelen) ( \ PySlice_Unpack((slice), (start), (stop), (step)) < 0 ? \ ((*(slicelen) = 0), -1) : \ ((*(slicelen) = PySlice_AdjustIndices((length), (start), (stop), *(step))), \ 0)) +PyAPI_FUNC(int) PySlice_Unpack(PyObject *slice, + Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t *step); +PyAPI_FUNC(Py_ssize_t) PySlice_AdjustIndices(Py_ssize_t length, + Py_ssize_t *start, Py_ssize_t *stop, + Py_ssize_t step); +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_SLICEOBJECT_H */ diff --git a/Include/structmember.h b/Include/structmember.h new file mode 100644 index 0000000000000000000000000000000000000000..f6e8fd829892f41cc7789123b84a5bb3c15e0485 --- /dev/null +++ b/Include/structmember.h @@ -0,0 +1,56 @@ +#ifndef Py_STRUCTMEMBER_H +#define Py_STRUCTMEMBER_H +#ifdef __cplusplus +extern "C" { +#endif + + +/* Interface to map C struct members to Python object attributes + * + * This header is deprecated: new code should not use stuff from here. + * New definitions are in descrobject.h. + * + * However, there's nothing wrong with old code continuing to use it, + * and there's not much maintenance overhead in keeping a few aliases. + * So, don't be too eager to convert old code. + * + * It uses names not prefixed with Py_. + * It is also *not* included from Python.h and must be included individually.
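 + * + * Illustrative sketch (MyObject is hypothetical, not from the original + * header): with the aliases below, a legacy member table keeps compiling + * unchanged: + * + * static PyMemberDef my_members[] = { + * {"value", T_INT, offsetof(MyObject, value), READONLY, "doc"}, + * {NULL} + * }; + * + * where T_INT and READONLY expand to Py_T_INT and Py_READONLY from + * descrobject.h.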
+ */ + +#include <stddef.h> /* For offsetof (not always provided by Python.h) */ + +/* Types */ +#define T_SHORT Py_T_SHORT +#define T_INT Py_T_INT +#define T_LONG Py_T_LONG +#define T_FLOAT Py_T_FLOAT +#define T_DOUBLE Py_T_DOUBLE +#define T_STRING Py_T_STRING +#define T_OBJECT _Py_T_OBJECT +#define T_CHAR Py_T_CHAR +#define T_BYTE Py_T_BYTE +#define T_UBYTE Py_T_UBYTE +#define T_USHORT Py_T_USHORT +#define T_UINT Py_T_UINT +#define T_ULONG Py_T_ULONG +#define T_STRING_INPLACE Py_T_STRING_INPLACE +#define T_BOOL Py_T_BOOL +#define T_OBJECT_EX Py_T_OBJECT_EX +#define T_LONGLONG Py_T_LONGLONG +#define T_ULONGLONG Py_T_ULONGLONG +#define T_PYSSIZET Py_T_PYSSIZET +#define T_NONE _Py_T_NONE + +/* Flags */ +#define READONLY Py_READONLY +#define PY_AUDIT_READ Py_AUDIT_READ +#define READ_RESTRICTED Py_AUDIT_READ +#define PY_WRITE_RESTRICTED _Py_WRITE_RESTRICTED +#define RESTRICTED (READ_RESTRICTED | PY_WRITE_RESTRICTED) + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_STRUCTMEMBER_H */ diff --git a/Include/structseq.h b/Include/structseq.h new file mode 100644 index 0000000000000000000000000000000000000000..29e24fee54e6135ee93be1f3faf4d4b7d914c8f3 --- /dev/null +++ b/Include/structseq.h @@ -0,0 +1,46 @@ + +/* Named tuple object interface */ + +#ifndef Py_STRUCTSEQ_H +#define Py_STRUCTSEQ_H +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PyStructSequence_Field { + const char *name; + const char *doc; +} PyStructSequence_Field; + +typedef struct PyStructSequence_Desc { + const char *name; + const char *doc; + PyStructSequence_Field *fields; + int n_in_sequence; +} PyStructSequence_Desc; + +PyAPI_DATA(const char * const) PyStructSequence_UnnamedField; + +#ifndef Py_LIMITED_API +PyAPI_FUNC(void) PyStructSequence_InitType(PyTypeObject *type, + PyStructSequence_Desc *desc); +PyAPI_FUNC(int) PyStructSequence_InitType2(PyTypeObject *type, + PyStructSequence_Desc *desc); +#endif +PyAPI_FUNC(PyTypeObject*) PyStructSequence_NewType(PyStructSequence_Desc *desc); + +PyAPI_FUNC(PyObject *) PyStructSequence_New(PyTypeObject* type); + +PyAPI_FUNC(void) PyStructSequence_SetItem(PyObject*, Py_ssize_t, PyObject*); +PyAPI_FUNC(PyObject*) PyStructSequence_GetItem(PyObject*, Py_ssize_t); + +#ifndef Py_LIMITED_API +typedef PyTupleObject PyStructSequence; +#define PyStructSequence_SET_ITEM PyStructSequence_SetItem +#define PyStructSequence_GET_ITEM PyStructSequence_GetItem +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_STRUCTSEQ_H */ diff --git a/Include/sysmodule.h b/Include/sysmodule.h new file mode 100644 index 0000000000000000000000000000000000000000..5a0af2e1578eb70225d56f4e22d7d5ffa7eab1d4 --- /dev/null +++ b/Include/sysmodule.h @@ -0,0 +1,44 @@ +#ifndef Py_SYSMODULE_H +#define Py_SYSMODULE_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_FUNC(PyObject *) PySys_GetObject(const char *); +PyAPI_FUNC(int) PySys_SetObject(const char *, PyObject *); + +Py_DEPRECATED(3.11) PyAPI_FUNC(void) PySys_SetArgv(int, wchar_t **); +Py_DEPRECATED(3.11) PyAPI_FUNC(void) PySys_SetArgvEx(int, wchar_t **, int); + +PyAPI_FUNC(void) PySys_WriteStdout(const char *format, ...) + Py_GCC_ATTRIBUTE((format(printf, 1, 2))); +PyAPI_FUNC(void) PySys_WriteStderr(const char *format, ...)
+ Py_GCC_ATTRIBUTE((format(printf, 1, 2))); +PyAPI_FUNC(void) PySys_FormatStdout(const char *format, ...); +PyAPI_FUNC(void) PySys_FormatStderr(const char *format, ...); + +Py_DEPRECATED(3.13) PyAPI_FUNC(void) PySys_ResetWarnOptions(void); + +PyAPI_FUNC(PyObject *) PySys_GetXOptions(void); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030d0000 +PyAPI_FUNC(int) PySys_Audit( + const char *event, + const char *argFormat, + ...); + +PyAPI_FUNC(int) PySys_AuditTuple( + const char *event, + PyObject *args); +#endif + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_SYSMODULE_H +# include "cpython/sysmodule.h" +# undef Py_CPYTHON_SYSMODULE_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_SYSMODULE_H */ diff --git a/Include/traceback.h b/Include/traceback.h new file mode 100644 index 0000000000000000000000000000000000000000..2b40cc9fc3261708db590f523dc767acf3742531 --- /dev/null +++ b/Include/traceback.h @@ -0,0 +1,26 @@ +#ifndef Py_TRACEBACK_H +#define Py_TRACEBACK_H +#ifdef __cplusplus +extern "C" { +#endif + +/* Traceback interface */ + +PyAPI_FUNC(int) PyTraceBack_Here(PyFrameObject *); +PyAPI_FUNC(int) PyTraceBack_Print(PyObject *, PyObject *); + +/* Reveal traceback type so we can typecheck traceback objects */ +PyAPI_DATA(PyTypeObject) PyTraceBack_Type; +#define PyTraceBack_Check(v) Py_IS_TYPE((v), &PyTraceBack_Type) + + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_TRACEBACK_H +# include "cpython/traceback.h" +# undef Py_CPYTHON_TRACEBACK_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_TRACEBACK_H */ diff --git a/Include/tupleobject.h b/Include/tupleobject.h new file mode 100644 index 0000000000000000000000000000000000000000..1f9ab54be65f87e5e78b1ea192a27886b2e2cbdb --- /dev/null +++ b/Include/tupleobject.h @@ -0,0 +1,46 @@ +/* Tuple object interface */ + +#ifndef Py_TUPLEOBJECT_H +#define Py_TUPLEOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +/* +Another generally useful object type is a tuple of object pointers. +For Python, this is an immutable type. C code can change the tuple items +(but not their number), and even use tuples as general-purpose arrays of +object references, but in general only brand new tuples should be mutated, +not ones that might already have been exposed to Python code. + +*** WARNING *** PyTuple_SetItem does not increment the new item's reference +count, but does decrement the reference count of the item it replaces, +if not nil. It does *decrement* the reference count if it is *not* +inserted in the tuple. Similarly, PyTuple_GetItem does not increment the +returned item's reference count. 
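 + + For example (an illustrative sketch, not from the original header), + filling a fresh tuple transfers ownership of each new reference, so no + Py_DECREF of the items is needed on success: + + PyObject *t = PyTuple_New(2); + PyTuple_SetItem(t, 0, PyLong_FromLong(1)); + PyTuple_SetItem(t, 1, PyUnicode_FromString("x")); + + (Error checking omitted for brevity.)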
+*/ + +PyAPI_DATA(PyTypeObject) PyTuple_Type; +PyAPI_DATA(PyTypeObject) PyTupleIter_Type; + +#define PyTuple_Check(op) \ + PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_TUPLE_SUBCLASS) +#define PyTuple_CheckExact(op) Py_IS_TYPE((op), &PyTuple_Type) + +PyAPI_FUNC(PyObject *) PyTuple_New(Py_ssize_t size); +PyAPI_FUNC(Py_ssize_t) PyTuple_Size(PyObject *); +PyAPI_FUNC(PyObject *) PyTuple_GetItem(PyObject *, Py_ssize_t); +PyAPI_FUNC(int) PyTuple_SetItem(PyObject *, Py_ssize_t, PyObject *); +PyAPI_FUNC(PyObject *) PyTuple_GetSlice(PyObject *, Py_ssize_t, Py_ssize_t); +PyAPI_FUNC(PyObject *) PyTuple_Pack(Py_ssize_t, ...); + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_TUPLEOBJECT_H +# include "cpython/tupleobject.h" +# undef Py_CPYTHON_TUPLEOBJECT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_TUPLEOBJECT_H */ diff --git a/Include/typeslots.h b/Include/typeslots.h new file mode 100644 index 0000000000000000000000000000000000000000..506b05580de146bbcae3e25e1e9ec18c773e9014 --- /dev/null +++ b/Include/typeslots.h @@ -0,0 +1,88 @@ +/* Do not renumber the file; these numbers are part of the stable ABI. */ +#define Py_bf_getbuffer 1 +#define Py_bf_releasebuffer 2 +#define Py_mp_ass_subscript 3 +#define Py_mp_length 4 +#define Py_mp_subscript 5 +#define Py_nb_absolute 6 +#define Py_nb_add 7 +#define Py_nb_and 8 +#define Py_nb_bool 9 +#define Py_nb_divmod 10 +#define Py_nb_float 11 +#define Py_nb_floor_divide 12 +#define Py_nb_index 13 +#define Py_nb_inplace_add 14 +#define Py_nb_inplace_and 15 +#define Py_nb_inplace_floor_divide 16 +#define Py_nb_inplace_lshift 17 +#define Py_nb_inplace_multiply 18 +#define Py_nb_inplace_or 19 +#define Py_nb_inplace_power 20 +#define Py_nb_inplace_remainder 21 +#define Py_nb_inplace_rshift 22 +#define Py_nb_inplace_subtract 23 +#define Py_nb_inplace_true_divide 24 +#define Py_nb_inplace_xor 25 +#define Py_nb_int 26 +#define Py_nb_invert 27 +#define Py_nb_lshift 28 +#define Py_nb_multiply 29 +#define Py_nb_negative 30 +#define Py_nb_or 31 +#define Py_nb_positive 32 +#define Py_nb_power 33 +#define Py_nb_remainder 34 +#define Py_nb_rshift 35 +#define Py_nb_subtract 36 +#define Py_nb_true_divide 37 +#define Py_nb_xor 38 +#define Py_sq_ass_item 39 +#define Py_sq_concat 40 +#define Py_sq_contains 41 +#define Py_sq_inplace_concat 42 +#define Py_sq_inplace_repeat 43 +#define Py_sq_item 44 +#define Py_sq_length 45 +#define Py_sq_repeat 46 +#define Py_tp_alloc 47 +#define Py_tp_base 48 +#define Py_tp_bases 49 +#define Py_tp_call 50 +#define Py_tp_clear 51 +#define Py_tp_dealloc 52 +#define Py_tp_del 53 +#define Py_tp_descr_get 54 +#define Py_tp_descr_set 55 +#define Py_tp_doc 56 +#define Py_tp_getattr 57 +#define Py_tp_getattro 58 +#define Py_tp_hash 59 +#define Py_tp_init 60 +#define Py_tp_is_gc 61 +#define Py_tp_iter 62 +#define Py_tp_iternext 63 +#define Py_tp_methods 64 +#define Py_tp_new 65 +#define Py_tp_repr 66 +#define Py_tp_richcompare 67 +#define Py_tp_setattr 68 +#define Py_tp_setattro 69 +#define Py_tp_str 70 +#define Py_tp_traverse 71 +#define Py_tp_members 72 +#define Py_tp_getset 73 +#define Py_tp_free 74 +#define Py_nb_matrix_multiply 75 +#define Py_nb_inplace_matrix_multiply 76 +#define Py_am_await 77 +#define Py_am_aiter 78 +#define Py_am_anext 79 +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 +/* New in 3.5 */ +#define Py_tp_finalize 80 +#endif +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 +/* New in 3.10 */ +#define Py_am_send 81 +#endif diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h new file mode 
100644 index 0000000000000000000000000000000000000000..dee00715b3c51d576da7bfbcd84ae362f5f14a0b --- /dev/null +++ b/Include/unicodeobject.h @@ -0,0 +1,1021 @@ +#ifndef Py_UNICODEOBJECT_H +#define Py_UNICODEOBJECT_H + +/* + +Unicode implementation based on original code by Fredrik Lundh, +modified by Marc-Andre Lemburg (mal@lemburg.com) according to the +Unicode Integration Proposal. (See +http://www.egenix.com/files/python/unicode-proposal.txt). + +Copyright (c) Corporation for National Research Initiatives. + + + Original header: + -------------------------------------------------------------------- + + * Yet another Unicode string type for Python. This type supports the + * 16-bit Basic Multilingual Plane (BMP) only. + * + * Written by Fredrik Lundh, January 1999. + * + * Copyright (c) 1999 by Secret Labs AB. + * Copyright (c) 1999 by Fredrik Lundh. + * + * fredrik@pythonware.com + * http://www.pythonware.com + * + * -------------------------------------------------------------------- + * This Unicode String Type is + * + * Copyright (c) 1999 by Secret Labs AB + * Copyright (c) 1999 by Fredrik Lundh + * + * By obtaining, using, and/or copying this software and/or its + * associated documentation, you agree that you have read, understood, + * and will comply with the following terms and conditions: + * + * Permission to use, copy, modify, and distribute this software and its + * associated documentation for any purpose and without fee is hereby + * granted, provided that the above copyright notice appears in all + * copies, and that both that copyright notice and this permission notice + * appear in supporting documentation, and that the name of Secret Labs + * AB or the author not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. + * + * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO + * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * -------------------------------------------------------------------- */ + +/* === Internal API ======================================================= */ + +/* --- Internal Unicode Format -------------------------------------------- */ + +/* Python 3.x requires unicode */ +#define Py_USING_UNICODE + +#ifndef SIZEOF_WCHAR_T +#error Must define SIZEOF_WCHAR_T +#endif + +#define Py_UNICODE_SIZE SIZEOF_WCHAR_T + +/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. + Otherwise, Unicode strings are stored as UCS-2 (with limited support + for UTF-16) */ + +#if Py_UNICODE_SIZE >= 4 +#define Py_UNICODE_WIDE +#endif + +/* Set these flags if the platform has "wchar.h" and the + wchar_t type is a 16-bit unsigned type */ +/* #define HAVE_WCHAR_H */ +/* #define HAVE_USABLE_WCHAR_T */ + +/* If the compiler provides a wchar_t type we try to support it + through the interface functions PyUnicode_FromWideChar(), + PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ + +#ifdef HAVE_USABLE_WCHAR_T +# ifndef HAVE_WCHAR_H +# define HAVE_WCHAR_H +# endif +#endif + +/* Py_UCS4 and Py_UCS2 are typedefs for the respective + unicode representations. 
*/ +typedef uint32_t Py_UCS4; +typedef uint16_t Py_UCS2; +typedef uint8_t Py_UCS1; + +#ifdef __cplusplus +extern "C" { +#endif + + +PyAPI_DATA(PyTypeObject) PyUnicode_Type; +PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; + +#define PyUnicode_Check(op) \ + PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) +#define PyUnicode_CheckExact(op) Py_IS_TYPE((op), &PyUnicode_Type) + +/* --- Constants ---------------------------------------------------------- */ + +/* This Unicode character will be used as replacement character during + decoding if the errors argument is set to "replace". Note: the + Unicode character U+FFFD is the official REPLACEMENT CHARACTER in + Unicode 3.0. */ + +#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) + +/* === Public API ========================================================= */ + +/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ +PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( + const char *u, /* UTF-8 encoded string */ + Py_ssize_t size /* size of buffer */ + ); + +/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated + UTF-8 encoded bytes. The size is determined with strlen(). */ +PyAPI_FUNC(PyObject*) PyUnicode_FromString( + const char *u /* UTF-8 encoded string */ + ); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(PyObject*) PyUnicode_Substring( + PyObject *str, + Py_ssize_t start, + Py_ssize_t end); +#endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +/* Copy the string into a UCS4 buffer including the null character if copy_null + is set. Return NULL and raise an exception on error. Raise a SystemError if + the buffer is smaller than the string. Return buffer on success. + + buflen is the length of the buffer in (Py_UCS4) characters. */ +PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( + PyObject *unicode, + Py_UCS4* buffer, + Py_ssize_t buflen, + int copy_null); + +/* Copy the string into a UCS4 buffer. A new buffer is allocated using + * PyMem_Malloc; if this fails, NULL is returned with a memory error + exception set. */ +PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); +#endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +/* Get the length of the Unicode object. */ + +PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( + PyObject *unicode +); +#endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +/* Read a character from the string. */ + +PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( + PyObject *unicode, + Py_ssize_t index + ); + +/* Write a character to the string. The string must have been created through + PyUnicode_New, must not be shared, and must not have been hashed yet. + + Return 0 on success, -1 on error. */ + +PyAPI_FUNC(int) PyUnicode_WriteChar( + PyObject *unicode, + Py_ssize_t index, + Py_UCS4 character + ); +#endif + +/* Resize a Unicode object. The length is the number of codepoints. + + *unicode is modified to point to the new (resized) object and 0 + returned on success. + + Try to resize the string in place (which is usually faster than allocating + a new string and copy characters), or create a new string. + + Error handling is implemented as follows: an exception is set, -1 + is returned and *unicode left untouched. + + WARNING: The function doesn't check string content, the result may not be a + string in canonical representation. 
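 + + A minimal call pattern (an illustrative sketch, not from the original + header): + + if (PyUnicode_Resize(&unicode, new_length) < 0) { + ... exception set, unicode left unchanged ... + }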
*/ + +PyAPI_FUNC(int) PyUnicode_Resize( + PyObject **unicode, /* Pointer to the Unicode object */ + Py_ssize_t length /* New length */ + ); + +/* Decode obj to a Unicode object. + + bytes, bytearray and other bytes-like objects are decoded according to the + given encoding and error handler. The encoding and error handler can be + NULL to have the interface use UTF-8 and "strict". + + All other objects (including Unicode objects) raise an exception. + + The API returns NULL in case of an error. The caller is responsible + for decref'ing the returned objects. + +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( + PyObject *obj, /* Object */ + const char *encoding, /* encoding */ + const char *errors /* error handling */ + ); + +/* Copy an instance of a Unicode subtype to a new true Unicode object if + necessary. If obj is already a true Unicode object (not a subtype), return + the reference with *incremented* refcount. + + The API returns NULL in case of an error. The caller is responsible + for decref'ing the returned objects. + +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_FromObject( + PyObject *obj /* Object */ + ); + +PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( + const char *format, /* ASCII-encoded string */ + va_list vargs + ); +PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( + const char *format, /* ASCII-encoded string */ + ... + ); + +PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); +PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( + const char *u /* UTF-8 encoded string */ + ); + +/* --- wchar_t support for platforms which support it --------------------- */ + +#ifdef HAVE_WCHAR_H + +/* Create a Unicode Object from the wchar_t buffer w of the given + size. + + The buffer is copied into the new object. */ + +PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( + const wchar_t *w, /* wchar_t buffer */ + Py_ssize_t size /* size of buffer */ + ); + +/* Copies the Unicode Object contents into the wchar_t buffer w. At + most size wchar_t characters are copied. + + Note that the resulting wchar_t string may or may not be + 0-terminated. It is the responsibility of the caller to make sure + that the wchar_t string is 0-terminated in case this is required by + the application. + + Returns the number of wchar_t characters copied (excluding a + possibly trailing 0-termination character) or -1 in case of an + error. */ + +PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( + PyObject *unicode, /* Unicode object */ + wchar_t *w, /* wchar_t buffer */ + Py_ssize_t size /* size of buffer */ + ); + +/* Convert the Unicode object to a wide character string. The output string + always ends with a nul character. If size is not NULL, write the number of + wide characters (excluding the null character) into *size. + + Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) + on success. On error, returns NULL, *size is undefined and raises a + MemoryError. */ + +PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( + PyObject *unicode, /* Unicode object */ + Py_ssize_t *size /* number of characters of the result */ + ); + +#endif + +/* --- Unicode ordinals --------------------------------------------------- */ + +/* Create a Unicode Object from the given Unicode code point ordinal. + + The ordinal must be in range(0x110000). A ValueError is + raised in case it is not. + +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); + +/* === Builtin Codecs ===================================================== + + Many of these APIs take two arguments encoding and errors. 
These + parameters encoding and errors have the same semantics as the ones + of the builtin str() API. + + Setting encoding to NULL causes the default encoding (UTF-8) to be used. + + Error handling is set by errors which may also be set to NULL + meaning to use the default handling defined for the codec. Default + error handling for all builtin codecs is "strict" (ValueErrors are + raised). + + The codecs all use a similar interface. Only deviations from the + generic ones are documented. + +*/ + +/* --- Manage the default encoding ---------------------------------------- */ + +/* Returns "utf-8". */ +PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); + +/* --- Generic Codecs ----------------------------------------------------- */ + +/* Create a Unicode object by decoding the encoded string s of the + given size. */ + +PyAPI_FUNC(PyObject*) PyUnicode_Decode( + const char *s, /* encoded string */ + Py_ssize_t size, /* size of buffer */ + const char *encoding, /* encoding */ + const char *errors /* error handling */ + ); + +/* Decode a Unicode object unicode and return the result as Python + object. + + This API is DEPRECATED. The only supported standard encoding is rot13. + Use PyCodec_Decode() to decode with rot13 and non-standard codecs + that decode from str. */ + +Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( + PyObject *unicode, /* Unicode object */ + const char *encoding, /* encoding */ + const char *errors /* error handling */ + ); + +/* Decode a Unicode object unicode and return the result as Unicode + object. + + This API is DEPRECATED. The only supported standard encoding is rot13. + Use PyCodec_Decode() to decode with rot13 and non-standard codecs + that decode from str to str. */ + +Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( + PyObject *unicode, /* Unicode object */ + const char *encoding, /* encoding */ + const char *errors /* error handling */ + ); + +/* Encodes a Unicode object and returns the result as Python + object. + + This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString() + since all standard encodings (except rot13) encode str to bytes. + Use PyCodec_Encode() for encoding with rot13 and non-standard codecs + that encode from str to non-bytes. */ + +Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( + PyObject *unicode, /* Unicode object */ + const char *encoding, /* encoding */ + const char *errors /* error handling */ + ); + +/* Encodes a Unicode object and returns the result as Python string + object. */ + +PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( + PyObject *unicode, /* Unicode object */ + const char *encoding, /* encoding */ + const char *errors /* error handling */ + ); + +/* Encodes a Unicode object and returns the result as Unicode + object. + + This API is DEPRECATED. The only supported standard encoding is rot13. + Use PyCodec_Encode() to encode with rot13 and non-standard codecs + that encode from str to str. */ + +Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( + PyObject *unicode, /* Unicode object */ + const char *encoding, /* encoding */ + const char *errors /* error handling */ + ); + +/* Build an encoding map.
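 + + (Illustrative sketch, assuming decoding_table is a length-256 str of the + kind generated for charmap codecs: + + PyObject *map = PyUnicode_BuildEncodingMap(decoding_table); + + the result is the reverse map used when encoding through the charmap + codec.)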
*/ + +PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( + PyObject* string /* 256 character map */ + ); + +/* --- UTF-7 Codecs ------------------------------------------------------- */ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( + const char *string, /* UTF-7 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors /* error handling */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( + const char *string, /* UTF-7 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ + ); + +/* --- UTF-8 Codecs ------------------------------------------------------- */ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( + const char *string, /* UTF-8 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors /* error handling */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( + const char *string, /* UTF-8 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( + PyObject *unicode /* Unicode object */ + ); + +/* Returns a pointer to the default encoding (UTF-8) of the + Unicode object unicode and the size of the encoded representation + in bytes stored in *size. + + In case of an error, no *size is set. + + This function caches the UTF-8 encoded string in the unicodeobject + and subsequent calls will return the same string. The memory is released + when the unicodeobject is deallocated. +*/ + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 +PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize( + PyObject *unicode, + Py_ssize_t *size); +#endif + +/* --- UTF-32 Codecs ------------------------------------------------------ */ + +/* Decodes length bytes from a UTF-32 encoded buffer string and returns + the corresponding Unicode object. + + errors (if non-NULL) defines the error handling. It defaults + to "strict". + + If byteorder is non-NULL, the decoder starts decoding using the + given byte order: + + *byteorder == -1: little endian + *byteorder == 0: native order + *byteorder == 1: big endian + + In native mode, the first four bytes of the stream are checked for a + BOM mark. If found, the BOM mark is analysed, the byte order + adjusted and the BOM skipped. In the other modes, no BOM mark + interpretation is done. After completion, *byteorder is set to the + current byte order at the end of input data. + + If byteorder is NULL, the codec starts in native order mode. + +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( + const char *string, /* UTF-32 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + int *byteorder /* pointer to byteorder to use + 0=native;-1=LE,1=BE; updated on + exit */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( + const char *string, /* UTF-32 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + int *byteorder, /* pointer to byteorder to use + 0=native;-1=LE,1=BE; updated on + exit */ + Py_ssize_t *consumed /* bytes consumed */ + ); + +/* Returns a Python string using the UTF-32 encoding in native byte + order. The string always starts with a BOM mark. */ + +PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( + PyObject *unicode /* Unicode object */ + ); + +/* Returns a Python string object holding the UTF-32 encoded value of + the Unicode data. 
+ + If byteorder is not 0, output is written according to the following + byte order: + + byteorder == -1: little endian + byteorder == 0: native byte order (writes a BOM mark) + byteorder == 1: big endian + + If byteorder is 0, the output string will always start with the + Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is + prepended. + +*/ + +/* --- UTF-16 Codecs ------------------------------------------------------ */ + +/* Decodes length bytes from a UTF-16 encoded buffer string and returns + the corresponding Unicode object. + + errors (if non-NULL) defines the error handling. It defaults + to "strict". + + If byteorder is non-NULL, the decoder starts decoding using the + given byte order: + + *byteorder == -1: little endian + *byteorder == 0: native order + *byteorder == 1: big endian + + In native mode, the first two bytes of the stream are checked for a + BOM mark. If found, the BOM mark is analysed, the byte order + adjusted and the BOM skipped. In the other modes, no BOM mark + interpretation is done. After completion, *byteorder is set to the + current byte order at the end of input data. + + If byteorder is NULL, the codec starts in native order mode. + +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( + const char *string, /* UTF-16 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + int *byteorder /* pointer to byteorder to use + 0=native;-1=LE,1=BE; updated on + exit */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( + const char *string, /* UTF-16 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + int *byteorder, /* pointer to byteorder to use + 0=native;-1=LE,1=BE; updated on + exit */ + Py_ssize_t *consumed /* bytes consumed */ + ); + +/* Returns a Python string using the UTF-16 encoding in native byte + order. The string always starts with a BOM mark. */ + +PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( + PyObject *unicode /* Unicode object */ + ); + +/* --- Unicode-Escape Codecs ---------------------------------------------- */ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( + const char *string, /* Unicode-Escape encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors /* error handling */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( + PyObject *unicode /* Unicode object */ + ); + +/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( + const char *string, /* Raw-Unicode-Escape encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors /* error handling */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( + PyObject *unicode /* Unicode object */ + ); + +/* --- Latin-1 Codecs ----------------------------------------------------- + + Note: Latin-1 corresponds to the first 256 Unicode ordinals. */ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( + const char *string, /* Latin-1 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors /* error handling */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( + PyObject *unicode /* Unicode object */ + ); + +/* --- ASCII Codecs ------------------------------------------------------- + + Only 7-bit ASCII data is expected. All other codes generate errors. 
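 + + For instance (illustrative only): + + PyObject *s = PyUnicode_DecodeASCII("abc", 3, "strict"); + + succeeds, while any input byte >= 0x80 causes a UnicodeDecodeError under + the "strict" error handler.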
+ +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( + const char *string, /* ASCII encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors /* error handling */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( + PyObject *unicode /* Unicode object */ + ); + +/* --- Character Map Codecs ----------------------------------------------- + + This codec uses mappings to encode and decode characters. + + Decoding mappings must map byte ordinals (integers in the range from 0 to + 255) to Unicode strings, integers (which are then interpreted as Unicode + ordinals) or None. Unmapped data bytes (ones which cause a LookupError) + as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined + mapping" and cause an error. + + Encoding mappings must map Unicode ordinal integers to bytes objects, + integers in the range from 0 to 255 or None. Unmapped character + ordinals (ones which cause a LookupError) as well as mapped to + None are treated as "undefined mapping" and cause an error. + +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( + const char *string, /* Encoded string */ + Py_ssize_t length, /* size of string */ + PyObject *mapping, /* decoding mapping */ + const char *errors /* error handling */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( + PyObject *unicode, /* Unicode object */ + PyObject *mapping /* encoding mapping */ + ); + +/* --- MBCS codecs for Windows -------------------------------------------- */ + +#ifdef MS_WINDOWS +PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( + const char *string, /* MBCS encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors /* error handling */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( + const char *string, /* MBCS encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ + ); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( + int code_page, /* code page number */ + const char *string, /* encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ + ); +#endif + +PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( + PyObject *unicode /* Unicode object */ + ); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( + int code_page, /* code page number */ + PyObject *unicode, /* Unicode object */ + const char *errors /* error handling */ + ); +#endif + +#endif /* MS_WINDOWS */ + +/* --- Locale encoding --------------------------------------------------- */ + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +/* Decode a string from the current locale encoding. The decoder is strict if + *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' + error handler (PEP 383) to escape undecodable bytes. If a byte sequence can + be decoded as a surrogate character and *surrogateescape* is not equal to + zero, the byte sequence is escaped using the 'surrogateescape' error handler + instead of being decoded. *str* must end with a null character but cannot + contain embedded null characters. */ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( + const char *str, + Py_ssize_t len, + const char *errors); + +/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string + length using strlen(). 
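 + + (Illustrative sketch, assuming the environment variable is set so the + pointer is non-NULL: + + PyObject *home = PyUnicode_DecodeLocale(getenv("HOME"), + "surrogateescape"); + + decodes the raw bytes with the current locale encoding.)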
*/ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( + const char *str, + const char *errors); + +/* Encode a Unicode object to the current locale encoding. The encoder is + strict if *surrogateescape* is equal to zero, otherwise the + "surrogateescape" error handler is used. Return a bytes object. The string + cannot contain embedded null characters. */ + +PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( + PyObject *unicode, + const char *errors + ); +#endif + +/* --- File system encoding ---------------------------------------------- */ + +/* ParseTuple converter: encode str objects to bytes using + PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ + +PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); + +/* ParseTuple converter: decode bytes objects to unicode using + PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ + +PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); + +/* Decode a null-terminated string from the Python filesystem encoding + and error handler. + + If the string length is known, use PyUnicode_DecodeFSDefaultAndSize(). */ +PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( + const char *s /* encoded string */ + ); + +/* Decode a string from the Python filesystem encoding and error handler. */ +PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( + const char *s, /* encoded string */ + Py_ssize_t size /* size */ + ); + +/* Encode a Unicode object to the Python filesystem encoding and error handler. + Return bytes. */ +PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( + PyObject *unicode + ); + +/* --- Methods & Slots ---------------------------------------------------- + + These are capable of handling Unicode objects and strings on input + (we refer to them as strings in the descriptions) and return + Unicode objects or integers as appropriate. */ + +/* Concat two strings giving a new Unicode string. */ + +PyAPI_FUNC(PyObject*) PyUnicode_Concat( + PyObject *left, /* Left string */ + PyObject *right /* Right string */ + ); + +/* Concat two strings and put the result in *pleft + (sets *pleft to NULL on error) */ + +PyAPI_FUNC(void) PyUnicode_Append( + PyObject **pleft, /* Pointer to left string */ + PyObject *right /* Right string */ + ); + +/* Concat two strings, put the result in *pleft and drop the right object + (sets *pleft to NULL on error) */ + +PyAPI_FUNC(void) PyUnicode_AppendAndDel( + PyObject **pleft, /* Pointer to left string */ + PyObject *right /* Right string */ + ); + +/* Split a string giving a list of Unicode strings. + + If sep is NULL, splitting will be done at all whitespace + substrings. Otherwise, splits occur at the given separator. + + At most maxsplit splits will be done. If negative, no limit is set. + + Separators are not included in the resulting list. + +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_Split( + PyObject *s, /* String to split */ + PyObject *sep, /* String separator */ + Py_ssize_t maxsplit /* Maxsplit count */ + ); + +/* Ditto, but split at line breaks. + + CRLF is considered to be one line break. Line breaks are not + included in the resulting list. */ + +PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( + PyObject *s, /* String to split */ + int keepends /* If true, line end markers are included */ + ); + +/* Partition a string using a given separator. */ + +PyAPI_FUNC(PyObject*) PyUnicode_Partition( + PyObject *s, /* String to partition */ + PyObject *sep /* String separator */ + ); + +/* Partition a string using a given separator, searching from the end of the + string.
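 + + (Illustrative: for the string "a.b.c" and separator ".", the result is + the 3-tuple ("a.b", ".", "c"), mirroring str.rpartition().)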
*/ + +PyAPI_FUNC(PyObject*) PyUnicode_RPartition( + PyObject *s, /* String to partition */ + PyObject *sep /* String separator */ + ); + +/* Split a string giving a list of Unicode strings. + + If sep is NULL, splitting will be done at all whitespace + substrings. Otherwise, splits occur at the given separator. + + At most maxsplit splits will be done. But unlike PyUnicode_Split + PyUnicode_RSplit splits from the end of the string. If negative, + no limit is set. + + Separators are not included in the resulting list. + +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_RSplit( + PyObject *s, /* String to split */ + PyObject *sep, /* String separator */ + Py_ssize_t maxsplit /* Maxsplit count */ + ); + +/* Translate a string by applying a character mapping table to it and + return the resulting Unicode object. + + The mapping table must map Unicode ordinal integers to Unicode strings, + Unicode ordinal integers or None (causing deletion of the character). + + Mapping tables may be dictionaries or sequences. Unmapped character + ordinals (ones which cause a LookupError) are left untouched and + are copied as-is. + +*/ + +PyAPI_FUNC(PyObject *) PyUnicode_Translate( + PyObject *str, /* String */ + PyObject *table, /* Translate table */ + const char *errors /* error handling */ + ); + +/* Join a sequence of strings using the given separator and return + the resulting Unicode string. */ + +PyAPI_FUNC(PyObject*) PyUnicode_Join( + PyObject *separator, /* Separator string */ + PyObject *seq /* Sequence object */ + ); + +/* Return 1 if substr matches str[start:end] at the given tail end, 0 + otherwise. */ + +PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( + PyObject *str, /* String */ + PyObject *substr, /* Prefix or Suffix string */ + Py_ssize_t start, /* Start index */ + Py_ssize_t end, /* Stop index */ + int direction /* Tail end: -1 prefix, +1 suffix */ + ); + +/* Return the first position of substr in str[start:end] using the + given search direction or -1 if not found. -2 is returned in case + an error occurred and an exception is set. */ + +PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( + PyObject *str, /* String */ + PyObject *substr, /* Substring to find */ + Py_ssize_t start, /* Start index */ + Py_ssize_t end, /* Stop index */ + int direction /* Find direction: +1 forward, -1 backward */ + ); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 +/* Like PyUnicode_Find, but search for single character only. */ +PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( + PyObject *str, + Py_UCS4 ch, + Py_ssize_t start, + Py_ssize_t end, + int direction + ); +#endif + +/* Count the number of occurrences of substr in str[start:end]. */ + +PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( + PyObject *str, /* String */ + PyObject *substr, /* Substring to count */ + Py_ssize_t start, /* Start index */ + Py_ssize_t end /* Stop index */ + ); + +/* Replace at most maxcount occurrences of substr in str with replstr + and return the resulting Unicode object. */ + +PyAPI_FUNC(PyObject *) PyUnicode_Replace( + PyObject *str, /* String */ + PyObject *substr, /* Substring to find */ + PyObject *replstr, /* Substring to replace */ + Py_ssize_t maxcount /* Max. number of replacements to apply; + -1 = all */ + ); + +/* Compare two strings and return -1, 0, 1 for less than, equal, + greater than resp. + Raise an exception and return -1 on error. 
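 + + Because -1 is also a valid "less than" result, callers distinguish errors + with PyErr_Occurred() (an illustrative sketch, not from the original + header): + + int r = PyUnicode_Compare(left, right); + if (r == -1 && PyErr_Occurred()) { + ... handle the error ... + }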
*/ + +PyAPI_FUNC(int) PyUnicode_Compare( + PyObject *left, /* Left string */ + PyObject *right /* Right string */ + ); + +/* Compare a Unicode object with C string and return -1, 0, 1 for less than, + equal, and greater than, respectively. It is best to pass only + ASCII-encoded strings, but the function interprets the input string as + ISO-8859-1 if it contains non-ASCII characters. + This function does not raise exceptions. */ + +PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( + PyObject *left, + const char *right /* ASCII-encoded string */ + ); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 +/* Compare a Unicode object with UTF-8 encoded C string. + Return 1 if they are equal, or 0 otherwise. + This function does not raise exceptions. */ + +PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *); +PyAPI_FUNC(int) PyUnicode_EqualToUTF8AndSize(PyObject *, const char *, Py_ssize_t); +#endif + +/* Rich compare two strings and return one of the following: + + - NULL in case an exception was raised + - Py_True or Py_False for successful comparisons + - Py_NotImplemented in case the type combination is unknown + + Possible values for op: + + Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE + +*/ + +PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( + PyObject *left, /* Left string */ + PyObject *right, /* Right string */ + int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ + ); + +/* Apply an argument tuple or dictionary to a format string and return + the resulting Unicode string. */ + +PyAPI_FUNC(PyObject *) PyUnicode_Format( + PyObject *format, /* Format string */ + PyObject *args /* Argument tuple or dictionary */ + ); + +/* Checks whether element is contained in container and return 1/0 + accordingly. + + element has to coerce to a one element Unicode string. -1 is + returned in case of an error. */ + +PyAPI_FUNC(int) PyUnicode_Contains( + PyObject *container, /* Container string */ + PyObject *element /* Element string */ + ); + +/* Checks whether argument is a valid identifier. 
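 + For example, it returns 1 for "foo_1" and 0 for "1foo", mirroring + str.isidentifier(). (Illustrative note, not part of the original header.)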
*/ + +PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); + +/* === Characters Type APIs =============================================== */ + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_UNICODEOBJECT_H +# include "cpython/unicodeobject.h" +# undef Py_CPYTHON_UNICODEOBJECT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_UNICODEOBJECT_H */ diff --git a/Include/warnings.h b/Include/warnings.h new file mode 100644 index 0000000000000000000000000000000000000000..18ac1543a3ca9e1d5a100badc221a14d61a1d58f --- /dev/null +++ b/Include/warnings.h @@ -0,0 +1,45 @@ +#ifndef Py_WARNINGS_H +#define Py_WARNINGS_H +#ifdef __cplusplus +extern "C" { +#endif + +PyAPI_FUNC(int) PyErr_WarnEx( + PyObject *category, + const char *message, /* UTF-8 encoded string */ + Py_ssize_t stack_level); + +PyAPI_FUNC(int) PyErr_WarnFormat( + PyObject *category, + Py_ssize_t stack_level, + const char *format, /* ASCII-encoded string */ + ...); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03060000 +/* Emit a ResourceWarning warning */ +PyAPI_FUNC(int) PyErr_ResourceWarning( + PyObject *source, + Py_ssize_t stack_level, + const char *format, /* ASCII-encoded string */ + ...); +#endif + +PyAPI_FUNC(int) PyErr_WarnExplicit( + PyObject *category, + const char *message, /* UTF-8 encoded string */ + const char *filename, /* decoded from the filesystem encoding */ + int lineno, + const char *module, /* UTF-8 encoded string */ + PyObject *registry); + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_WARNINGS_H +# include "cpython/warnings.h" +# undef Py_CPYTHON_WARNINGS_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_WARNINGS_H */ + diff --git a/Include/weakrefobject.h b/Include/weakrefobject.h new file mode 100644 index 0000000000000000000000000000000000000000..a6e71eb178b124b236b0668ddda4fc27f5b64a7b --- /dev/null +++ b/Include/weakrefobject.h @@ -0,0 +1,46 @@ +/* Weak references objects for Python. 
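+
+   Editorial aside (not part of the upstream header): the Python-level
+   counterpart of PyWeakref_NewRef / PyWeakref_GetRef below is the weakref
+   module. A minimal sketch of the same semantics:
+
+       import weakref
+
+       class Thing:
+           pass
+
+       t = Thing()
+       r = weakref.ref(t)    # ~ PyWeakref_NewRef(t, NULL)
+       assert r() is t       # live referent: caller gets a strong reference
+       del t
+       assert r() is None    # dead referent, as when PyWeakref_GetRef
+                             # stores NULL and returns 0
+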
*/ + +#ifndef Py_WEAKREFOBJECT_H +#define Py_WEAKREFOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _PyWeakReference PyWeakReference; + +PyAPI_DATA(PyTypeObject) _PyWeakref_RefType; +PyAPI_DATA(PyTypeObject) _PyWeakref_ProxyType; +PyAPI_DATA(PyTypeObject) _PyWeakref_CallableProxyType; + +#define PyWeakref_CheckRef(op) PyObject_TypeCheck((op), &_PyWeakref_RefType) +#define PyWeakref_CheckRefExact(op) \ + Py_IS_TYPE((op), &_PyWeakref_RefType) +#define PyWeakref_CheckProxy(op) \ + (Py_IS_TYPE((op), &_PyWeakref_ProxyType) \ + || Py_IS_TYPE((op), &_PyWeakref_CallableProxyType)) + +#define PyWeakref_Check(op) \ + (PyWeakref_CheckRef(op) || PyWeakref_CheckProxy(op)) + + +PyAPI_FUNC(PyObject *) PyWeakref_NewRef(PyObject *ob, + PyObject *callback); +PyAPI_FUNC(PyObject *) PyWeakref_NewProxy(PyObject *ob, + PyObject *callback); +Py_DEPRECATED(3.13) PyAPI_FUNC(PyObject *) PyWeakref_GetObject(PyObject *ref); + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 +PyAPI_FUNC(int) PyWeakref_GetRef(PyObject *ref, PyObject **pobj); +#endif + + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_WEAKREFOBJECT_H +# include "cpython/weakrefobject.h" +# undef Py_CPYTHON_WEAKREFOBJECT_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_WEAKREFOBJECT_H */ diff --git a/reward_curve.png b/reward_curve.png new file mode 100644 index 0000000000000000000000000000000000000000..9aa011729699ed80354ac83ed85f98dfb819684d Binary files /dev/null and b/reward_curve.png differ diff --git a/src/rl/train_grpo.py b/src/rl/train_grpo.py index d2bddd4b23b95b5183e5745f78400d968da68d3d..82be3f1ad9490c13c451cafda019b2b5748ee6ef 100644 --- a/src/rl/train_grpo.py +++ b/src/rl/train_grpo.py @@ -29,10 +29,31 @@ def openenv_reward_func(prompts, completions, **kwargs): benign_samples = batch["benign"] env.reset(adv_samples, benign_samples) + if random.random() < 0.05: + logging.info(f"--- Sample Prompt ---\n{prompts[0]}\n---------------------") + logging.info(f"--- Sample Completion ---\n{completions[0][:200]}...\n-------------------------") + for comp in completions: + # Extract string if comp is a ChatML message list (e.g. 
[{"role": "assistant", "content": "..."}]) + if isinstance(comp, list): + if len(comp) > 0 and isinstance(comp[-1], dict) and "content" in comp[-1]: + comp_text = comp[-1]["content"] + else: + comp_text = str(comp) + else: + comp_text = str(comp) + + partial_reward = 0.0 + if '{' in comp_text: + partial_reward += 0.5 try: - clean_json = extract_and_clean_json(comp) + clean_json = extract_and_clean_json(comp_text) parsed_ast = json.loads(clean_json) + partial_reward += 1.0 # Valid JSON syntax + + if 'root' in parsed_ast or 'operator' in parsed_ast: + partial_reward += 2.0 # Has basic AST structure + # Validate AST ast_wrapper = GuardrailGraph.model_validate(parsed_ast) @@ -40,10 +61,13 @@ def openenv_reward_func(prompts, completions, **kwargs): action = Action(ast_json=clean_json) recall, fpr, syntax_error = env.step(action) r = r_engine.calculate(recall, fpr, syntax_error) - rewards.append(r) - except (json.JSONDecodeError, ValidationError): - # Massive negative reward for syntax errors - rewards.append(-10.0) + rewards.append(r + partial_reward) + except json.JSONDecodeError: + # Massive negative reward for syntax errors, but add partial + rewards.append(-10.0 + partial_reward) + except ValidationError: + # Valid JSON but invalid schema + rewards.append(-5.0 + partial_reward) return rewards @@ -80,9 +104,10 @@ def train(): dataset_cache.max_size = 10000 dataset_cache.ingest_dummy_data() # Which actually pulls the real HF datasets now - # Create simple dataset of prompts to trigger JSON AST generation - # Provide enough prompts for 250 steps with batch_size=4 and grad_accum=4 (4000 total) - prompts_data = [{"prompt": "You are a cyber security expert. Generate a JSON AST GuardrailGraph to block prompt injections but allow benign queries. Output strictly in ```json format."}] * 5000 + # Create simple dataset of prompts to trigger JSON AST generation using ChatML + system_prompt = "You are an autonomous Blue-Team engineer. Generate a highly constrained, Pydantic-validated JSON Guardrail Logic Graph to block prompt injections but allow benign queries. Output ONLY valid JSON inside ```json ... ``` blocks. Do not include conversational filler." + user_prompt = "Analyze the threat vectors and synthesize the GuardrailGraph now." + prompts_data = [{"prompt": [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]}] * 5000 train_dataset = Dataset.from_list(prompts_data) training_args = GRPOConfig( @@ -90,7 +115,10 @@ def train(): learning_rate=1e-5, per_device_train_batch_size=4, # Pushing 8GB VRAM to 95% util gradient_accumulation_steps=4, # Effective batch size 16 + num_generations=4, # Fix: Reduce from 8 to 4 to prevent OOM / Shared Memory Swapping max_steps=250, # 30-45 mins on RTX 4070 + max_completion_length=1024, # Fix: Prevent 256 token cutoff + max_prompt_length=512, logging_steps=1, save_steps=50, bf16=is_bfloat16_supported(), diff --git a/unsloth_compiled_cache/UnslothAlignPropTrainer.py b/unsloth_compiled_cache/UnslothAlignPropTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..51e1cfddffc3bfdf38c56fe4983cb53c28e00a13 --- /dev/null +++ b/unsloth_compiled_cache/UnslothAlignPropTrainer.py @@ -0,0 +1,848 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . + +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.alignprop_trainer import (Accelerator, AlignPropConfig, AlignPropTrainer, Any, Callable, DDPOStableDiffusionPipeline, Optional, Path, ProjectConfiguration, PyTorchModelHubMixin, Union, defaultdict, generate_model_card, get_comet_experiment_url, is_wandb_available, logger, logging, os, set_seed, textwrap, torch, warnings) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) + logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values + all_per_token_logps.append(per_token_logps) + pass + all_per_token_logps = torch.concat(all_per_token_logps) + all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) + return all_per_token_logps + +def calculate_pad_tokens_in_prompt( + input_ids: torch.Tensor, + logits_to_keep: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given prompt tensor, it returns all the left padded tokens in that sequence. 
so [pad, pad, pad, cat] = 3 tokens + """ + if logits_to_keep >= input_ids.shape[1]: + raise ValueError("logits_to_keep must be smaller than the sequence length.") + + prompt_section = input_ids[:, :-logits_to_keep] + + padding_mask = (prompt_section == pad_token_id) + + pad_token_counts = padding_mask.sum(dim=1) + + return pad_token_counts + +def create_completion_attention_mask( + completion_input_ids: torch.Tensor, + left_pad_tokens_per_prompt: torch.Tensor, + max_left_pad: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad] + + Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens + and pad are pad tokens, this function would make a completion mask that would 0 out the pad + and p tokens. so in this example [0,0,0,1,1,1,0,0,0] + """ + batch_size, completion_len = completion_input_ids.shape + device = completion_input_ids.device + + num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt + + indices = torch.arange(completion_len, device=device).unsqueeze(0) + shift_mask = indices >= num_tokens_to_mask.unsqueeze(1) + + non_padding_mask = (completion_input_ids != pad_token_id) + + final_mask = shift_mask & non_padding_mask + + return final_mask + +def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor: + """ + Moves all padding tokens in each sequence of a batch to the right. + """ + mask = (tensor != pad_id) + # Must do stable=True since binary mark is unordered + sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True) + packed_tensor = torch.gather(tensor, 1, sorted_indices) + return packed_tensor + +def align_logprobs_with_mask( + logprob_tensor: torch.Tensor, + attention_mask: torch.Tensor, + pad_value: float = 0.0 +) -> torch.Tensor: + """ + Aligns a log probability tensor with a given attention mask. + """ + + device = logprob_tensor.device + batch_size, logprob_seq_len = logprob_tensor.shape + mask_seq_len = attention_mask.shape[1] + + padded_logprobs = torch.full( + attention_mask.shape, + fill_value=pad_value, + dtype=logprob_tensor.dtype, + device=device + ) + + left_pad_counts = torch.argmax(attention_mask, dim=1) + + cols = torch.arange(logprob_seq_len, device=device) + dest_indices = left_pad_counts.unsqueeze(1) + cols + + # Create destination row indices + # Shape: [batch_size, logprob_seq_len] + row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices) + + # --- 4. Filter out-of-bounds indices and perform assignment --- + # Create a mask to identify only the indices that are within the bounds + # of the target tensor's sequence length. + valid_mask = dest_indices < mask_seq_len + + # Use this mask to select only the valid row indices, column indices, + # and the corresponding values from the logprob tensor. + # This flattens the selected elements into 1D tensors. + valid_rows = row_indices[valid_mask] + valid_cols = dest_indices[valid_mask] + valid_vals = logprob_tensor[valid_mask] + + # Place the valid values into their correct positions in the padded tensor + # using a single, efficient advanced indexing operation. + padded_logprobs[valid_rows, valid_cols] = valid_vals + + return padded_logprobs +@dataclass +class UnslothAlignPropConfig(AlignPropConfig): + """ + +Configuration class for the [`AlignPropTrainer`]. 
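+
+Example (an illustrative sketch only; the import path assumes this generated
+cache directory is importable, and the values shown are not defaults):
+
+    from unsloth_compiled_cache.UnslothAlignPropTrainer import UnslothAlignPropConfig
+
+    config = UnslothAlignPropConfig(
+        train_learning_rate = 1e-4,
+        train_batch_size = 2,
+        sample_num_steps = 50,
+        mixed_precision = 'bf16',
+    )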
+ +Using [`~transformers.HfArgumentParser`] we can turn this class into +[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the +command line. + +Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(sys.argv[0])[: -len(".py")]`): + Name of this experiment (defaults to the file name without the extension). + run_name (`str`, *optional*, defaults to `""`): + Name of this run. + seed (`int`, *optional*, defaults to `0`): + Random seed for reproducibility. + log_with (`str` or `None`, *optional*, defaults to `None`): + Log with either `"wandb"` or `"tensorboard"`. Check + [tracking](https://huggingface.co/docs/accelerate/usage_guides/tracking) for more details. + log_image_freq (`int`, *optional*, defaults to `1`): + Frequency for logging images. + tracker_kwargs (`dict[str, Any]`, *optional*, defaults to `{}`): + Keyword arguments for the tracker (e.g., `wandb_project`). + accelerator_kwargs (`dict[str, Any]`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator. + project_kwargs (`dict[str, Any]`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator project config (e.g., `logging_dir`). + tracker_project_name (`str`, *optional*, defaults to `"trl"`): + Name of project to use for tracking. + logdir (`str`, *optional*, defaults to `"logs"`): + Top-level logging directory for checkpoint saving. + num_epochs (`int`, *optional*, defaults to `100`): + Number of epochs to train. + save_freq (`int`, *optional*, defaults to `1`): + Number of epochs between saving model checkpoints. + num_checkpoint_limit (`int`, *optional*, defaults to `5`): + Number of checkpoints to keep before overwriting old ones. + mixed_precision (`str`, *optional*, defaults to `"fp16"`): + Mixed precision training. + allow_tf32 (`bool`, *optional*, defaults to `True`): + Allow `tf32` on Ampere GPUs. + resume_from (`str`, *optional*, defaults to `""`): + Path to resume training from a checkpoint. + sample_num_steps (`int`, *optional*, defaults to `50`): + Number of sampler inference steps. + sample_eta (`float`, *optional*, defaults to `1.0`): + Eta parameter for the DDIM sampler. + sample_guidance_scale (`float`, *optional*, defaults to `5.0`): + Classifier-free guidance weight. + train_batch_size (`int`, *optional*, defaults to `1`): + Batch size for training. + train_use_8bit_adam (`bool`, *optional*, defaults to `False`): + Whether to use the 8bit Adam optimizer from `bitsandbytes`. + train_learning_rate (`float`, *optional*, defaults to `1e-3`): + Learning rate. + train_adam_beta1 (`float`, *optional*, defaults to `0.9`): + Beta1 for Adam optimizer. + train_adam_beta2 (`float`, *optional*, defaults to `0.999`): + Beta2 for Adam optimizer. + train_adam_weight_decay (`float`, *optional*, defaults to `1e-4`): + Weight decay for Adam optimizer. + train_adam_epsilon (`float`, *optional*, defaults to `1e-8`): + Epsilon value for Adam optimizer. + train_gradient_accumulation_steps (`int`, *optional*, defaults to `1`): + Number of gradient accumulation steps. + train_max_grad_norm (`float`, *optional*, defaults to `1.0`): + Maximum gradient norm for gradient clipping. + negative_prompts (`str` or `None`, *optional*, defaults to `None`): + Comma-separated list of prompts to use as negative examples. + truncated_backprop_rand (`bool`, *optional*, defaults to `True`): + If `True`, randomized truncation to different diffusion timesteps is used. 
+    truncated_backprop_timestep (`int`, *optional*, defaults to `49`):
+        Absolute timestep to which the gradients are backpropagated. Used only if `truncated_backprop_rand=False`.
+    truncated_rand_backprop_minmax (`tuple[int, int]`, *optional*, defaults to `(0, 50)`):
+        Range of diffusion timesteps for randomized truncated backpropagation.
+    push_to_hub (`bool`, *optional*, defaults to `False`):
+        Whether to push the final model to the Hub.
+
+    """
+    vllm_sampling_params: Optional[Any] = field(
+        default = None,
+        metadata = {'help': 'vLLM SamplingParams'},
+    )
+    unsloth_num_chunks : Optional[int] = field(
+        default = -1,
+        metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'},
+    )
+
+    def __init__(
+        self,
+        exp_name = 'train_grpo',
+        run_name = '',
+        seed = 3407,
+        log_with = None,
+        log_image_freq = 1,
+        tracker_project_name = 'trl',
+        logdir = 'logs',
+        num_epochs = 100,
+        save_freq = 1,
+        num_checkpoint_limit = 5,
+        mixed_precision = 'fp16',
+        allow_tf32 = True,
+        resume_from = '',
+        sample_num_steps = 50,
+        sample_eta = 1.0,
+        sample_guidance_scale = 5.0,
+        train_batch_size = 1,
+        train_use_8bit_adam = False,
+        train_learning_rate = 5e-05,
+        train_adam_beta1 = 0.9,
+        train_adam_beta2 = 0.999,
+        train_adam_weight_decay = 0.01,
+        train_adam_epsilon = 1e-08,
+        train_gradient_accumulation_steps = 2,
+        train_max_grad_norm = 1.0,
+        negative_prompts = None,
+        truncated_backprop_rand = True,
+        truncated_backprop_timestep = 49,
+        push_to_hub = False,
+        vllm_sampling_params = None,
+        unsloth_num_chunks = -1,
+
+        **kwargs,
+    ):
+        if train_learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{train_learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!')
+        if train_learning_rate > 1: print(f'Unsloth: Your learning rate of `{train_learning_rate}` is far too large (> 1)! Consider decreasing it to 1e-1, otherwise gradient updates will explode!')
+
+        super().__init__(
+            exp_name = exp_name,
+            run_name = run_name,
+            seed = seed,
+            log_with = log_with,
+            log_image_freq = log_image_freq,
+            tracker_project_name = tracker_project_name,
+            logdir = logdir,
+            num_epochs = num_epochs,
+            save_freq = save_freq,
+            num_checkpoint_limit = num_checkpoint_limit,
+            mixed_precision = mixed_precision,
+            allow_tf32 = allow_tf32,
+            resume_from = resume_from,
+            sample_num_steps = sample_num_steps,
+            sample_eta = sample_eta,
+            sample_guidance_scale = sample_guidance_scale,
+            train_batch_size = train_batch_size,
+            train_use_8bit_adam = train_use_8bit_adam,
+            train_learning_rate = train_learning_rate,
+            train_adam_beta1 = train_adam_beta1,
+            train_adam_beta2 = train_adam_beta2,
+            train_adam_weight_decay = train_adam_weight_decay,
+            train_adam_epsilon = train_adam_epsilon,
+            train_gradient_accumulation_steps = train_gradient_accumulation_steps,
+            train_max_grad_norm = train_max_grad_norm,
+            negative_prompts = negative_prompts,
+            truncated_backprop_rand = truncated_backprop_rand,
+            truncated_backprop_timestep = truncated_backprop_timestep,
+            push_to_hub = push_to_hub,**kwargs)
+        self.vllm_sampling_params = vllm_sampling_params
+        self.unsloth_num_chunks = unsloth_num_chunks
+
+pass
+
+class _UnslothAlignPropTrainer(PyTorchModelHubMixin):
+    """
+    The AlignPropTrainer uses Deep Diffusion Policy Optimization to optimise diffusion models.
Note, this trainer is + heavily inspired by the work here: https://github.com/mihirp1998/AlignProp/ As of now only Stable Diffusion based + pipelines are supported + + Args: + config (`AlignPropConfig`): + Configuration object for AlignPropTrainer. Check the documentation of `PPOConfig` for more details. + reward_function (`Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor]`): + Reward function to be used + prompt_function (`Callable[[], tuple[str, Any]]`): + Function to generate prompts to guide model + sd_pipeline (`DDPOStableDiffusionPipeline`): + Stable Diffusion pipeline to be used for training. + image_samples_hook (`Optional[Callable[[Any, Any, Any], Any]]`): + Hook to be called to log images + """ + + _tag_names = ["trl", "alignprop"] + + def __init__( + self, + config: AlignPropConfig, + reward_function: Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor], + prompt_function: Callable[[], tuple[str, Any]], + sd_pipeline: DDPOStableDiffusionPipeline, + image_samples_hook: Optional[Callable[[Any, Any, Any], Any]] = None, + ): + warnings.warn( + "AlignPropTrainer is deprecated and will be removed in version 0.23.0.", + DeprecationWarning, + ) + if image_samples_hook is None: + logger.warning("No image_samples_hook provided; no images will be logged") + + self.prompt_fn = prompt_function + self.reward_fn = reward_function + self.config = config + self.image_samples_callback = image_samples_hook + + accelerator_project_config = ProjectConfiguration(**self.config.project_kwargs) + + if self.config.resume_from: + self.config.resume_from = os.path.normpath(os.path.expanduser(self.config.resume_from)) + if "checkpoint_" not in os.path.basename(self.config.resume_from): + # get the most recent checkpoint in this directory + checkpoints = list( + filter( + lambda x: "checkpoint_" in x, + os.listdir(self.config.resume_from), + ) + ) + if len(checkpoints) == 0: + raise ValueError(f"No checkpoints found in {self.config.resume_from}") + checkpoint_numbers = sorted([int(x.split("_")[-1]) for x in checkpoints]) + self.config.resume_from = os.path.join( + self.config.resume_from, + f"checkpoint_{checkpoint_numbers[-1]}", + ) + + accelerator_project_config.iteration = checkpoint_numbers[-1] + 1 + + self.accelerator = Accelerator( + log_with=self.config.log_with, + mixed_precision=self.config.mixed_precision, + project_config=accelerator_project_config, + # we always accumulate gradients across timesteps; we want config.train.gradient_accumulation_steps to be the + # number of *samples* we accumulate across, so we need to multiply by the number of training timesteps to get + # the total number of optimizer steps to accumulate across. 
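+            # (Clarification, added for readability: each accumulate() context in
+            # step() below runs one full sampling + backward pass, so with
+            # train_gradient_accumulation_steps = 2 the optimizer steps once for
+            # every two generated batches.)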
+ gradient_accumulation_steps=self.config.train_gradient_accumulation_steps, + **self.config.accelerator_kwargs, + ) + + is_using_tensorboard = config.log_with is not None and config.log_with == "tensorboard" + + if self.accelerator.is_main_process: + self.accelerator.init_trackers( + self.config.tracker_project_name, + config=dict(alignprop_trainer_config=config.to_dict()) + if not is_using_tensorboard + else config.to_dict(), + init_kwargs=self.config.tracker_kwargs, + ) + + logger.info(f"\n{config}") + + set_seed(self.config.seed, device_specific=True) + + self.sd_pipeline = sd_pipeline + + self.sd_pipeline.set_progress_bar_config( + position=1, + disable=not self.accelerator.is_local_main_process, + leave=False, + desc="Timestep", + dynamic_ncols=True, + ) + + # For mixed precision training we cast all non-trainable weights [vae, non-lora text_encoder and non-lora unet] to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. + if self.accelerator.mixed_precision == "fp16": + inference_dtype = torch.float16 + elif self.accelerator.mixed_precision == "bf16": + inference_dtype = torch.bfloat16 + else: + inference_dtype = torch.float32 + + self.sd_pipeline.vae.to(self.accelerator.device, dtype=inference_dtype) + self.sd_pipeline.text_encoder.to(self.accelerator.device, dtype=inference_dtype) + self.sd_pipeline.unet.to(self.accelerator.device, dtype=inference_dtype) + + trainable_layers = self.sd_pipeline.get_trainable_layers() + + self.accelerator.register_save_state_pre_hook(self._save_model_hook) + self.accelerator.register_load_state_pre_hook(self._load_model_hook) + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if self.config.allow_tf32 and torch.cuda.is_available(): + torch.backends.cuda.matmul.allow_tf32 = True + + self.optimizer = self._setup_optimizer( + trainable_layers.parameters() if not isinstance(trainable_layers, list) else trainable_layers + ) + + self.neg_prompt_embed = self.sd_pipeline.text_encoder( + self.sd_pipeline.tokenizer( + [""] if self.config.negative_prompts is None else self.config.negative_prompts, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.sd_pipeline.tokenizer.model_max_length, + ).input_ids.to(self.accelerator.device) + )[0] + + # NOTE: for some reason, autocast is necessary for non-lora training but for lora training it isn't necessary and it uses + # more memory + self.autocast = self.sd_pipeline.autocast or self.accelerator.autocast + + if hasattr(self.sd_pipeline, "use_lora") and self.sd_pipeline.use_lora: + unet, self.optimizer = self.accelerator.prepare(trainable_layers, self.optimizer) + self.trainable_layers = list(filter(lambda p: p.requires_grad, unet.parameters())) + else: + self.trainable_layers, self.optimizer = self.accelerator.prepare(trainable_layers, self.optimizer) + + if config.resume_from: + logger.info(f"Resuming from {config.resume_from}") + self.accelerator.load_state(config.resume_from) + self.first_epoch = int(config.resume_from.split("_")[-1]) + 1 + else: + self.first_epoch = 0 + + def compute_rewards(self, prompt_image_pairs): + reward, reward_metadata = self.reward_fn( + prompt_image_pairs["images"], prompt_image_pairs["prompts"], prompt_image_pairs["prompt_metadata"] + ) + return reward + + def step(self, epoch: int, global_step: int): + """ + Perform a single step of training. + + Args: + epoch (int): The current epoch. 
+ global_step (int): The current global step. + + Side Effects: + - Model weights are updated + - Logs the statistics to the accelerator trackers. + - If `self.image_samples_callback` is not None, it will be called with the prompt_image_pairs, global_step, + and the accelerator tracker. + + Returns: + global_step (int): The updated global step. + """ + info = defaultdict(list) + + self.sd_pipeline.unet.train() + + for _ in range(self.config.train_gradient_accumulation_steps): + with self.accelerator.accumulate(self.sd_pipeline.unet), self.autocast(), torch.enable_grad(): + prompt_image_pairs = self._generate_samples( + batch_size=self.config.train_batch_size, + ) + + rewards = self.compute_rewards(prompt_image_pairs) + + prompt_image_pairs["rewards"] = rewards + + rewards_vis = self.accelerator.gather(rewards).detach().cpu().numpy() + + loss = self.calculate_loss(rewards) + + self.accelerator.backward(loss) + + if self.accelerator.sync_gradients: + self.accelerator.clip_grad_norm_( + self.trainable_layers.parameters() + if not isinstance(self.trainable_layers, list) + else self.trainable_layers, + self.config.train_max_grad_norm, + ) + + self.optimizer.step() + self.optimizer.zero_grad() + + info["reward_mean"].append(rewards_vis.mean()) + info["reward_std"].append(rewards_vis.std()) + info["loss"].append(loss.item()) + + # Checks if the accelerator has performed an optimization step behind the scenes + if self.accelerator.sync_gradients: + # log training-related stuff + info = {k: torch.mean(torch.tensor(v)) for k, v in info.items()} + info = self.accelerator.reduce(info, reduction="mean") + info.update({"epoch": epoch}) + self.accelerator.log(info, step=global_step) + global_step += 1 + info = defaultdict(list) + else: + raise ValueError( + "Optimization step should have been performed by this point. Please check calculated gradient accumulation settings." 
+ ) + # Logs generated images + if self.image_samples_callback is not None and global_step % self.config.log_image_freq == 0: + self.image_samples_callback(prompt_image_pairs, global_step, self.accelerator.trackers[0]) + + if epoch != 0 and epoch % self.config.save_freq == 0 and self.accelerator.is_main_process: + self.accelerator.save_state() + + return global_step + + def calculate_loss(self, rewards): + """ + Calculate the loss for a batch of an unpacked sample + + Args: + rewards (torch.Tensor): + Differentiable reward scalars for each generated image, shape: [batch_size] + + Returns: + loss (torch.Tensor) (all of these are of shape (1,)) + """ + # Loss is specific to Aesthetic Reward function used in AlignProp (https://huggingface.co/papers/2310.03739) + loss = 10.0 - (rewards).mean() + return loss + + def loss( + self, + advantages: torch.Tensor, + clip_range: float, + ratio: torch.Tensor, + ): + unclipped_loss = -advantages * ratio + clipped_loss = -advantages * torch.clamp( + ratio, + 1.0 - clip_range, + 1.0 + clip_range, + ) + return torch.mean(torch.maximum(unclipped_loss, clipped_loss)) + + def _setup_optimizer(self, trainable_layers_parameters): + if self.config.train_use_8bit_adam: + import bitsandbytes + + optimizer_cls = bitsandbytes.optim.AdamW8bit + else: + optimizer_cls = torch.optim.AdamW + + return optimizer_cls( + trainable_layers_parameters, + lr=self.config.train_learning_rate, + betas=(self.config.train_adam_beta1, self.config.train_adam_beta2), + weight_decay=self.config.train_adam_weight_decay, + eps=self.config.train_adam_epsilon, + ) + + def _save_model_hook(self, models, weights, output_dir): + self.sd_pipeline.save_checkpoint(models, weights, output_dir) + weights.pop() # ensures that accelerate doesn't try to handle saving of the model + + def _load_model_hook(self, models, input_dir): + self.sd_pipeline.load_checkpoint(models, input_dir) + models.pop() # ensures that accelerate doesn't try to handle loading of the model + + def _generate_samples(self, batch_size, with_grad=True, prompts=None): + """ + Generate samples from the model + + Args: + batch_size (int): Batch size to use for sampling + with_grad (bool): Whether the generated RGBs should have gradients attached to it. + prompts (list[str], *optional*): If provided, use these prompts instead of generating new ones. 
+ + Returns: + prompt_image_pairs (dict[Any]) + """ + prompt_image_pairs = {} + + sample_neg_prompt_embeds = self.neg_prompt_embed.repeat(batch_size, 1, 1) + + if prompts is None: + prompts, prompt_metadata = zip(*[self.prompt_fn() for _ in range(batch_size)]) + else: + prompt_metadata = [{} for _ in range(batch_size)] + + prompt_ids = self.sd_pipeline.tokenizer( + prompts, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.sd_pipeline.tokenizer.model_max_length, + ).input_ids.to(self.accelerator.device) + + prompt_embeds = self.sd_pipeline.text_encoder(prompt_ids)[0] + + if with_grad: + sd_output = self.sd_pipeline.rgb_with_grad( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=sample_neg_prompt_embeds, + num_inference_steps=self.config.sample_num_steps, + guidance_scale=self.config.sample_guidance_scale, + eta=self.config.sample_eta, + truncated_backprop_rand=self.config.truncated_backprop_rand, + truncated_backprop_timestep=self.config.truncated_backprop_timestep, + truncated_rand_backprop_minmax=self.config.truncated_rand_backprop_minmax, + output_type="pt", + ) + else: + sd_output = self.sd_pipeline( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=sample_neg_prompt_embeds, + num_inference_steps=self.config.sample_num_steps, + guidance_scale=self.config.sample_guidance_scale, + eta=self.config.sample_eta, + output_type="pt", + ) + + images = sd_output.images + + prompt_image_pairs["images"] = images + prompt_image_pairs["prompts"] = prompts + prompt_image_pairs["prompt_metadata"] = prompt_metadata + + return prompt_image_pairs + + def train(self, epochs: Optional[int] = None): + """ + Train the model for a given number of epochs + """ + global_step = 0 + if epochs is None: + epochs = self.config.num_epochs + for epoch in range(self.first_epoch, epochs): + global_step = self.step(epoch, global_step) + + def _save_pretrained(self, save_directory): + self.sd_pipeline.save_pretrained(save_directory) + self.create_model_card() + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
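+
+        Example (illustrative only; all names below are placeholders):
+
+            trainer.create_model_card(
+                model_name="alignprop-sd15",
+                dataset_name="my-prompt-set",
+                tags="diffusion",  # a plain string is normalized to a set below
+            )
+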
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + # docstyle-ignore + citation = textwrap.dedent("""\ + @article{prabhudesai2024aligning, + title = {{Aligning Text-to-Image Diffusion Models with Reward Backpropagation}}, + author = {Mihir Prabhudesai and Anirudh Goyal and Deepak Pathak and Katerina Fragkiadaki}, + year = 2024, + eprint = {arXiv:2310.03739} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="AlignProp", + trainer_citation=citation, + paper_title="Aligning Text-to-Image Diffusion Models with Reward Backpropagation", + paper_id="2310.03739", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothAlignPropTrainer(_UnslothAlignPropTrainer): + """ + +The AlignPropTrainer uses Deep Diffusion Policy Optimization to optimise diffusion models. Note, this trainer is +heavily inspired by the work here: https://github.com/mihirp1998/AlignProp/ As of now only Stable Diffusion based +pipelines are supported + +Args: + config (`AlignPropConfig`): + Configuration object for AlignPropTrainer. Check the documentation of `PPOConfig` for more details. + reward_function (`Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor]`): + Reward function to be used + prompt_function (`Callable[[], tuple[str, Any]]`): + Function to generate prompts to guide model + sd_pipeline (`DDPOStableDiffusionPipeline`): + Stable Diffusion pipeline to be used for training. + image_samples_hook (`Optional[Callable[[Any, Any, Any], Any]]`): + Hook to be called to log images + + """ + def __init__( + self, + config, + reward_function, + prompt_function, + sd_pipeline, + image_samples_hook = None, + **kwargs + ): + if args is None: args = UnslothAlignPropConfig() + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('alignprop_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? 
[TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + config = config, + reward_function = reward_function, + prompt_function = prompt_function, + sd_pipeline = sd_pipeline, + image_samples_hook = image_samples_hook,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + +pass + + +if hasattr(logger, "addFilter"): + import logging + class HideLoggingMessage(logging.Filter): + def __init__(self, text): self.text = text + def filter(self, x): return not (self.text in x.getMessage()) + pass + logger.addFilter(HideLoggingMessage("`use_cache=True`")) + diff --git a/unsloth_compiled_cache/UnslothBCOTrainer.py b/unsloth_compiled_cache/UnslothBCOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..610c6ecae52a3912eff70ca8049ae9b6b4b8588c --- /dev/null +++ b/unsloth_compiled_cache/UnslothBCOTrainer.py @@ -0,0 +1,2095 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . 
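+
+# Editorial sketch (not part of the generated cache): the padding and
+# log-softmax helpers defined below are pure tensor transforms and can be
+# sanity-checked in isolation, e.g. with pad_token_id == 0:
+#
+#     import torch
+#     x = torch.tensor([[0, 0, 7, 8],
+#                       [0, 5, 0, 6]])
+#     mask = (x != 0)
+#     idx = torch.argsort(mask, dim=1, descending=True, stable=True)
+#     torch.gather(x, 1, idx)     # left_pack_padding: [[7, 8, 0, 0],
+#                                 #                     [5, 6, 0, 0]]
+#
+#     ids = torch.tensor([[0, 0, 0, 11, 21, 22]])  # 3 left pads, 2 completion tokens
+#     (ids[:, :-2] == 0).sum(dim=1)                # calculate_pad_tokens_in_prompt
+#                                                  # with logits_to_keep=2 -> tensor([3])
+#
+# chunked_selective_log_softmax below is likewise numerically equivalent to
+#     torch.log_softmax(logits.float(), -1).gather(-1, index.unsqueeze(-1)).squeeze(-1)
+# computed in four chunks to bound peak memory.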
+ +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.bco_trainer import (Any, AutoModelForCausalLM, BCOConfig, BCOTrainer, BaseImageProcessor, CLF_NAME, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, Literal, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RUNNING_NAME, RunningMoments, SequentialSampler, Trainer, TrainerCallback, TrainingArguments, Union, _process_tokens, _tokenize, autocast, contextmanager, create_reference_model, defaultdict, disable_dropout_in_model, generate_model_card, get_comet_experiment_url, has_length, inspect, is_comet_available, is_joblib_available, is_peft_available, is_sklearn_available, is_wandb_available, itemgetter, log_table_to_comet_experiment, logger, logging, maybe_apply_chat_template, maybe_extract_prompt, maybe_unpair_preference_dataset, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_deepspeed, prepare_model_for_kbit_training, random, selective_log_softmax, textwrap, torch, tqdm, F, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, logger, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) + logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values + all_per_token_logps.append(per_token_logps) + pass + all_per_token_logps = torch.concat(all_per_token_logps) + all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) + return all_per_token_logps + +def calculate_pad_tokens_in_prompt( + input_ids: torch.Tensor, + logits_to_keep: int, + pad_token_id: int +) -> torch.Tensor: 
+ """ + Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens + """ + if logits_to_keep >= input_ids.shape[1]: + raise ValueError("logits_to_keep must be smaller than the sequence length.") + + prompt_section = input_ids[:, :-logits_to_keep] + + padding_mask = (prompt_section == pad_token_id) + + pad_token_counts = padding_mask.sum(dim=1) + + return pad_token_counts + +def create_completion_attention_mask( + completion_input_ids: torch.Tensor, + left_pad_tokens_per_prompt: torch.Tensor, + max_left_pad: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad] + + Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens + and pad are pad tokens, this function would make a completion mask that would 0 out the pad + and p tokens. so in this example [0,0,0,1,1,1,0,0,0] + """ + batch_size, completion_len = completion_input_ids.shape + device = completion_input_ids.device + + num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt + + indices = torch.arange(completion_len, device=device).unsqueeze(0) + shift_mask = indices >= num_tokens_to_mask.unsqueeze(1) + + non_padding_mask = (completion_input_ids != pad_token_id) + + final_mask = shift_mask & non_padding_mask + + return final_mask + +def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor: + """ + Moves all padding tokens in each sequence of a batch to the right. + """ + mask = (tensor != pad_id) + # Must do stable=True since binary mark is unordered + sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True) + packed_tensor = torch.gather(tensor, 1, sorted_indices) + return packed_tensor + +def align_logprobs_with_mask( + logprob_tensor: torch.Tensor, + attention_mask: torch.Tensor, + pad_value: float = 0.0 +) -> torch.Tensor: + """ + Aligns a log probability tensor with a given attention mask. + """ + + device = logprob_tensor.device + batch_size, logprob_seq_len = logprob_tensor.shape + mask_seq_len = attention_mask.shape[1] + + padded_logprobs = torch.full( + attention_mask.shape, + fill_value=pad_value, + dtype=logprob_tensor.dtype, + device=device + ) + + left_pad_counts = torch.argmax(attention_mask, dim=1) + + cols = torch.arange(logprob_seq_len, device=device) + dest_indices = left_pad_counts.unsqueeze(1) + cols + + # Create destination row indices + # Shape: [batch_size, logprob_seq_len] + row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices) + + # --- 4. Filter out-of-bounds indices and perform assignment --- + # Create a mask to identify only the indices that are within the bounds + # of the target tensor's sequence length. + valid_mask = dest_indices < mask_seq_len + + # Use this mask to select only the valid row indices, column indices, + # and the corresponding values from the logprob tensor. + # This flattens the selected elements into 1D tensors. + valid_rows = row_indices[valid_mask] + valid_cols = dest_indices[valid_mask] + valid_vals = logprob_tensor[valid_mask] + + # Place the valid values into their correct positions in the padded tensor + # using a single, efficient advanced indexing operation. + padded_logprobs[valid_rows, valid_cols] = valid_vals + + return padded_logprobs +@dataclass +class UnslothBCOConfig(BCOConfig): + """ + +Configuration class for the [`BCOTrainer`]. + +This class includes only the parameters that are specific to BCO training. 
For a full list of training arguments, +please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may +differ from those in [`~transformers.TrainingArguments`]. + +Using [`~transformers.HfArgumentParser`] we can turn this class into +[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the +command line. + +Parameters: + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`int` or `None`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. + label_pad_token_id (`int`, *optional*, defaults to `-100`): + Label pad token id. This argument is required if you want to use the default data collator. + padding_value (`int` or `None`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + truncation_mode (`str`, *optional*, defaults to `"keep_end"`): + Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. + This argument is required if you want to use the default data collator. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model and reference model. + generate_during_eval (`bool`, *optional*, defaults to `False`): + If `True`, generates and logs completions from both the model and the reference model to W&B or Comet + during evaluation. + is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`): + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. + precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): + Whether to precompute reference model log probabilities for training and evaluation datasets. This is + useful when training without the reference model to reduce the total GPU memory needed. + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + ref_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model + from a string. + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + prompt_sample_size (`int`, *optional*, defaults to `1024`): + Number of prompts that are fed to density ratio classifier. + min_density_ratio (`float`, *optional*, defaults to `0.5`): + Minimum value of the density ratio. The estimated density ratio is clamped to this value. + max_density_ratio (`float`, *optional*, defaults to `10.0`): + Maximum value of the density ratio. 
The estimated density ratio is clamped to this value. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + max_seq_length : Optional[int] = field( + default = None, + metadata = {'help': 'Maximum sequence length to truncate to.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = None, + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + parallelism_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + project = 'huggingface', + trackio_space_id = 'trackio', + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = True, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + 
torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = True, + max_length = 1024, + max_prompt_length = 512, + max_completion_length = None, + beta = 0.1, + label_pad_token_id = -100, + padding_value = None, + truncation_mode = 'keep_end', + disable_dropout = True, + generate_during_eval = False, + is_encoder_decoder = None, + precompute_ref_log_probs = False, + model_init_kwargs = None, + ref_model_init_kwargs = None, + dataset_num_proc = None, + prompt_sample_size = 1024, + min_density_ratio = 0.5, + max_density_ratio = 10.0, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + max_seq_length = None, + **kwargs, + ): + if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = min(max(cpu_count()+4, 2), 64) + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = 
half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + max_length = max_length, + max_prompt_length = max_prompt_length, + max_completion_length = max_completion_length, + beta = beta, + label_pad_token_id = label_pad_token_id, + padding_value = padding_value, + truncation_mode = truncation_mode, + disable_dropout = disable_dropout, + generate_during_eval = generate_during_eval, + is_encoder_decoder = 
is_encoder_decoder, + precompute_ref_log_probs = precompute_ref_log_probs, + model_init_kwargs = model_init_kwargs, + ref_model_init_kwargs = ref_model_init_kwargs, + dataset_num_proc = dataset_num_proc, + prompt_sample_size = prompt_sample_size, + min_density_ratio = min_density_ratio, + max_density_ratio = max_density_ratio, **kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks + self.max_seq_length = max_seq_length +pass + +class _UnslothBCOTrainer(Trainer): + r""" + Initialize BCOTrainer from [BCO](https://huggingface.co/papers/2404.04656) paper. + + Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForCausalLM`. + ref_model (`PreTrainedModelWrapper`): + Hugging Face transformer model with a causal language modelling head. Used for implicit reward computation + and loss. If no reference model is provided, the trainer will create a reference model with the same + architecture as the model to be optimized. + args (`BCOConfig`): + The arguments to use for training. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along with the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + data_collator (`transformers.DataCollator`, *optional*, defaults to `None`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used, which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + peft_config (`dict`, defaults to `None`): + The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in + a PEFT model. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take an `EvalPrediction` and return a dictionary mapping + strings to metric values. + model_adapter_name (`str`, defaults to `None`): + Name of the train target PEFT adapter, when using LoRA with multiple adapters. + ref_adapter_name (`str`, defaults to `None`): + Name of the reference PEFT adapter, when using LoRA with multiple adapters. 
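+ embedding_func (`Callable`, *optional*, defaults to `None`): + Embedding function used for underlying distribution matching (UDM); it maps embedding-tokenizer inputs to prompt embeddings. Requires the `scikit-learn` and `joblib` libraries. + embedding_tokenizer ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`): + Tokenizer used to prepare prompts for `embedding_func` when matching the underlying distribution. + + Example (a minimal usage sketch, not part of the generated code; `model`, `tokenizer` and `dataset` are placeholders for your own objects): + + ```python + training_args = BCOConfig(output_dir="bco-model", per_device_train_batch_size=1) + trainer = BCOTrainer(model=model, args=training_args, train_dataset=dataset, processing_class=tokenizer) + trainer.train() + ```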
+ """ + + _tag_names = ["trl", "bco"] + + def __init__( + self, + model: Union[PreTrainedModel, nn.Module, str] = None, + ref_model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, + args: BCOConfig = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + data_collator: Optional[DataCollator] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional[dict] = None, + compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None, + model_adapter_name: Optional[str] = None, + ref_adapter_name: Optional[str] = None, + embedding_func: Optional[Callable] = None, + embedding_tokenizer: Optional[PreTrainedTokenizerBase] = None, + ): + if embedding_func is not None and not (is_sklearn_available() and is_joblib_available()): + raise ImportError( + "BCOTrainer with UDM requires the scikit-learn and joblib libraries. Please install it with `pip install scikit-learn joblib`." + ) + + if type(args) is TrainingArguments: + raise ValueError("Please use `BCOConfig` instead `TrainingArguments`.") + + if not isinstance(model, str) and model is not None and ref_model is model: + raise ValueError( + "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the " + "same as `model`, you must mass a copy of it, or `None` if you use peft." + ) + + if args.model_init_kwargs is None: + model_init_kwargs = {} + elif not isinstance(model, str): + raise ValueError("You passed model_kwargs to the BCOTrainer. But your model is already instantiated.") + else: + model_init_kwargs = args.model_init_kwargs + dtype = model_init_kwargs.get("dtype") + if dtype is not None: + # Convert to `torch.dtype` if an str is passed + if isinstance(dtype, str) and dtype != "auto": + dtype = getattr(torch, dtype) + if dtype != "auto" and not isinstance(dtype, torch.dtype): + raise ValueError( + f"Invalid `dtype` passed to the BCOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}." + ) + model_init_kwargs["dtype"] = dtype + + if args.ref_model_init_kwargs is None: + ref_model_init_kwargs = {} + elif not isinstance(ref_model, str): + raise ValueError( + "You passed ref_model_kwargs to the BCOTrainer. But your ref_model is already instantiated." + ) + else: + ref_model_init_kwargs = args.ref_model_init_kwargs + dtype = ref_model_init_kwargs.get("dtype") + if dtype is not None: + # Convert to `torch.dtype` if an str is passed + if isinstance(dtype, str) and dtype != "auto": + dtype = getattr(torch, dtype) + if dtype != "auto" and not isinstance(dtype, torch.dtype): + raise ValueError( + f"Invalid `dtype` passed to the BCOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}." + ) + ref_model_init_kwargs["dtype"] = dtype + + if isinstance(model, str): + model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) + + if isinstance(ref_model, str): + ref_model = AutoModelForCausalLM.from_pretrained(ref_model, **ref_model_init_kwargs) + + # Initialize this variable to False. 
This helps track the case when `peft_module_casting_to_bf16` + # has been called in order to properly call autocast if needed. + self._peft_has_been_casted_to_bf16 = False + + if not is_peft_available() and peft_config is not None: + raise ValueError( + "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs. Please install it with `pip install peft` to use PEFT models." + ) + elif is_peft_available() and peft_config is not None: + # if model is a peft model and we have a peft_config, we merge and unload it first + if isinstance(model, PeftModel): + model = model.merge_and_unload() + + if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False): + _support_gc_kwargs = hasattr( + args, "gradient_checkpointing_kwargs" + ) and "gradient_checkpointing_kwargs" in list( + inspect.signature(prepare_model_for_kbit_training).parameters + ) + + prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing} + + if _support_gc_kwargs: + prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs + + model = prepare_model_for_kbit_training(model, **prepare_model_kwargs) + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + # get peft model with the given config + model = model + if args.bf16 and getattr(model, "is_loaded_in_4bit", False): + peft_module_casting_to_bf16(model) + # If args.bf16, we need to explicitly call `generate` with a torch amp autocast context manager + self._peft_has_been_casted_to_bf16 = True + + # For models that use gradient_checkpointing, we need to attach a hook that enables input + # to explicitly have `requires_grad=True`, otherwise training will either silently + # fail or fail completely. + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + if args.generate_during_eval and not (is_wandb_available() or is_comet_available()): + raise ValueError( + "`generate_during_eval=True` requires Weights and Biases or Comet to be installed." + " Please install `wandb` or `comet-ml` to resolve." 
+ ) + + if model is not None: + self.is_encoder_decoder = model.config.is_encoder_decoder + elif args.is_encoder_decoder is None: + raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.") + else: + self.is_encoder_decoder = args.is_encoder_decoder + + self.is_peft_model = is_peft_available() and isinstance(model, PeftModel) + self.model_adapter_name = model_adapter_name + self.ref_adapter_name = ref_adapter_name + + if ref_model: + self.ref_model = ref_model + elif self.is_peft_model or args.precompute_ref_log_probs: + # The `model` with adapters turned off will be used as the reference model + self.ref_model = None + else: + self.ref_model = create_reference_model(model) + + if processing_class is None: + raise ValueError( + "max_length or a processing_class must be specified when using the default DPODataCollatorWithPadding" + ) + if args.max_length is None: + logger.warning( + "When using DPODataCollatorWithPadding, you should set `max_length` in the `BCOConfig`. " + "It will be set to `512` by default, but you should do it yourself in the future.", + ) + max_length = 512 + if args.max_length is not None: + max_length = args.max_length + + if args.max_prompt_length is None: + logger.warning( + "When using DPODataCollatorWithPadding, you should set `max_prompt_length` in the `BCOConfig`. " + "It will be set to `128` by default, but you should do it yourself in the future.", + ) + max_prompt_length = 128 + if args.max_prompt_length is not None: + max_prompt_length = args.max_prompt_length + + max_completion_length = None + if args.max_completion_length is None and self.is_encoder_decoder: + logger.warning( + "When using DPODataCollatorWithPadding with an encoder-decoder architecture, you should set `max_completion_length` in the BCOTrainer's init." + " It will be set to `128` by default, but you should do it yourself in the future.", + ) + max_completion_length = 128 + if args.max_completion_length is not None and self.is_encoder_decoder: + max_completion_length = args.max_completion_length + + if data_collator is None: + data_collator = DPODataCollatorWithPadding( + pad_token_id=processing_class.pad_token_id, + label_pad_token_id=args.label_pad_token_id, + is_encoder_decoder=self.is_encoder_decoder, + ) + + if args.remove_unused_columns: + args.remove_unused_columns = False + # warn users + logger.warning( + "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your BCOConfig." + " We have set it for you, but you should do it yourself in the future.", + ) + + self.use_dpo_data_collator = True + else: + self.use_dpo_data_collator = False + + # Disable dropout in the model and reference model + if args.disable_dropout: + disable_dropout_in_model(model) + if self.ref_model is not None: + disable_dropout_in_model(self.ref_model) + + self.max_length = max_length + self.generate_during_eval = args.generate_during_eval + self.label_pad_token_id = args.label_pad_token_id + self.padding_value = args.padding_value if args.padding_value is not None else processing_class.pad_token_id + self.max_prompt_length = max_prompt_length + self.truncation_mode = args.truncation_mode + self.max_completion_length = max_completion_length + self.precompute_ref_log_probs = args.precompute_ref_log_probs + + # Since ref_logs are precomputed on the first call to get_train/eval_dataloader, + # keep track of the first call to avoid recomputation on future calls + self._precomputed_train_ref_log_probs = False + self._precomputed_eval_ref_log_probs = 
False + + # metric + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + + # BCO parameter + self.beta = args.beta + self.aux_loss_enabled = getattr(model.config, "output_router_logits", False) + self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0) + if self.aux_loss_enabled and self.aux_loss_coef == 0.0: + logger.warning( + "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to " + "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value " + "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary " + "loss.", + ) + + # Underlying Distribution Matching argument + self.embedding_func = embedding_func + self.embedding_tokenizer = embedding_tokenizer + + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in BCO, the sampled data does not include the + # "input_ids" key. Instead, the available keys are "prompt_input_ids" and "completion_input_ids". As a result, + # the trainer issues the warning: "Could not estimate the number of tokens of the input, floating-point + # operations will not be computed." To suppress this warning, we set the "estimate_tokens" key in the model's + # "warnings_issued" dictionary to True. This acts as a flag to indicate that the warning has already been + # issued. + model.warnings_issued["estimate_tokens"] = True + + with PartialState().main_process_first(): + # Extract the prompt if needed + train_dataset = train_dataset.map( + maybe_extract_prompt, num_proc=args.dataset_num_proc, desc="Extracting prompt from train dataset" + ) + # Unpair the dataset if needed + train_dataset = maybe_unpair_preference_dataset( + train_dataset, args.dataset_num_proc, desc="Unpairing train dataset" + ) + # Apply the chat template if needed + train_dataset = train_dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}, num_proc=args.dataset_num_proc + ) + if eval_dataset is not None: + # Extract the prompt if needed + eval_dataset = eval_dataset.map( + maybe_extract_prompt, num_proc=args.dataset_num_proc, desc="Extracting prompt from eval dataset" + ) + # Unpair the dataset if needed + eval_dataset = maybe_unpair_preference_dataset( + eval_dataset, args.dataset_num_proc, desc="Unpairing eval dataset" + ) + eval_dataset = eval_dataset.map( + maybe_apply_chat_template, + fn_kwargs={"tokenizer": processing_class}, + num_proc=args.dataset_num_proc, + ) + + # Tokenize and prepare the training datasets + train_dataset = train_dataset.map( + _tokenize, + batched=True, + fn_kwargs={"tokenizer": processing_class, "embedding_tokenizer": self.embedding_tokenizer}, + num_proc=args.dataset_num_proc, + desc="Tokenizing train dataset", + ) + + # Prepare the datasets + fn_kwargs = { + "prefix": "", + "is_encoder_decoder": self.is_encoder_decoder, + "tokenizer": processing_class, + "max_length": self.max_length, + "truncation_mode": self.truncation_mode, + "label_pad_token_id": self.label_pad_token_id, + "max_prompt_length": self.max_prompt_length, + "max_completion_length": self.max_completion_length, + } + train_dataset = train_dataset.map( + _process_tokens, + fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + desc="Processing tokenized train dataset", + ) + + if eval_dataset is not None: + # Tokenize + eval_dataset = eval_dataset.map( + _tokenize, + fn_kwargs={"tokenizer": 
processing_class, "embedding_tokenizer": self.embedding_tokenizer}, + batched=True, + num_proc=args.dataset_num_proc, + desc="Tokenizing eval dataset", + ) + + # Process + fn_kwargs = { + "prefix": "", + "is_encoder_decoder": self.is_encoder_decoder, + "tokenizer": processing_class, + "max_length": self.max_length, + "truncation_mode": self.truncation_mode, + "label_pad_token_id": self.label_pad_token_id, + "max_prompt_length": self.max_prompt_length, + "max_completion_length": self.max_completion_length, + } + eval_dataset = eval_dataset.map( + _process_tokens, + fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + desc="Processing tokenized eval dataset", + ) + + desirable = train_dataset.filter( + lambda x: x["label"], num_proc=args.dataset_num_proc, desc="Filtering desirable examples" + ) + undesirable = train_dataset.filter( + lambda x: not x["label"], num_proc=args.dataset_num_proc, desc="Filtering undesirable examples" + ) + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. + self.model_accepts_loss_kwargs = False + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + if not hasattr(self, "accelerator"): + raise AttributeError( + "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`." + ) + + # Deepspeed Zero-3 does not support precompute_ref_log_probs + if self.is_deepspeed_enabled: + if self.accelerator.state.deepspeed_plugin.zero_stage == 3 and self.precompute_ref_log_probs: + raise ValueError( + "You cannot use `precompute_ref_log_probs=True` with Deepspeed ZeRO-3. Please set `precompute_ref_log_probs=False`." + ) + + if self.ref_model is None: + if not (self.is_peft_model or self.precompute_ref_log_probs): + raise ValueError( + "No reference model and model is not a Peft model. 
Try setting `precompute_ref_log_probs=True`" + ) + else: + if self.is_deepspeed_enabled: + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + + self.running = RunningMoments(accelerator=self.accelerator) + + if self.embedding_func is None or args.resume_from_checkpoint: + return + + chosen_embeddings = self._get_sample_prompt_embeddings(desirable, sample_size=self.args.prompt_sample_size) + rejected_embeddings = self._get_sample_prompt_embeddings(undesirable, sample_size=self.args.prompt_sample_size) + + embeddings = torch.cat((chosen_embeddings, rejected_embeddings), dim=0) + labels = torch.cat( + (torch.ones_like(chosen_embeddings[:, 0]), torch.zeros_like(rejected_embeddings[:, 0])), dim=0 + ) + + self.clf = LogisticRegression(class_weight="balanced").fit( + embeddings.cpu().float().numpy(), labels.cpu().numpy() + ) + chosen_mean = self.clf.score( + chosen_embeddings.cpu().float().numpy(), torch.ones_like(chosen_embeddings[:, 0]).cpu().numpy() + ) + rejected_mean = self.clf.score( + rejected_embeddings.cpu().float().numpy(), torch.zeros_like(rejected_embeddings[:, 0]).cpu().numpy() + ) + logger.info(f"UDM classifier training scores: chosen: {chosen_mean}, rejected: {rejected_mean}") + + @property + def match_underlying_distribution(self): + return self.embedding_func is not None and self.embedding_tokenizer is not None + + def _get_chosen_prob(self, prompt_embeddings: torch.FloatTensor) -> torch.FloatTensor: + """ + Calculates the probability that the given prompt embeddings come from the desirable dataset. The + probability is computed on each process and then ensembled (averaged) across processes. + """ + dtype = prompt_embeddings.dtype + device = prompt_embeddings.device + rank = self.accelerator.process_index + + padded_prompt_embeddings = self.accelerator.pad_across_processes( + prompt_embeddings, pad_index=self.embedding_tokenizer.pad_token_id + ) + sample_size = padded_prompt_embeddings.shape[0] + nonzero = padded_prompt_embeddings.mean(dim=1) != self.embedding_tokenizer.pad_token_id + prompt_embeddings = self.accelerator.gather(padded_prompt_embeddings) + + # cannot predict when there are no values + if prompt_embeddings.shape[0] == 0: + return torch.tensor([], device=device, dtype=dtype) + + prob = self.clf.predict_proba(prompt_embeddings.cpu().float().numpy())[:, 1] + prob = torch.as_tensor(prob, dtype=dtype, device=device) + prob = self.accelerator.reduce(prob, reduction="mean") + + prob = prob[sample_size * rank : sample_size * (rank + 1)] + prob = prob[nonzero] + + return prob + + def _vectorize_prompt(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor) -> torch.FloatTensor: + """ + Replaces processing_class.pad_token_id with embedding_tokenizer.pad_token_id and applies self.embedding_func + """ + input_ids = torch.where( + input_ids == self.processing_class.pad_token_id, + self.embedding_tokenizer.pad_token_id, + input_ids, + ) + + with torch.no_grad(): + embeddings = self.embedding_func( + input_ids=input_ids, + attention_mask=attention_mask, + ) + + return embeddings + + def _get_prompt_embeddings( + self, batch: dict[str, Union[list, torch.LongTensor]] + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: + """Extract embeddings from the frozen embedding model""" + + if not self.match_underlying_distribution: + return None, None + + embeddings = self._vectorize_prompt( + input_ids=batch["embedding_input_ids"], + attention_mask=batch["embedding_attention_mask"], + ) 
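+ # Split the prompt embeddings by label: True marks desirable (chosen) prompts, False marks undesirable (rejected) ones.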
+ + labels = torch.tensor(batch["label"], dtype=torch.bool, device=embeddings.device) + chosen_idx = torch.where(labels)[0] + rejected_idx = torch.where(~labels)[0] + + chosen_embeddings = embeddings[chosen_idx, ...] + rejected_embeddings = embeddings[rejected_idx, ...] + + return (chosen_embeddings, rejected_embeddings) + + def _get_sample_prompt_embeddings(self, dataset: Dataset, sample_size: int = 512) -> torch.FloatTensor: + """ + Sample instances from dataset and get prompt embeddings. Used for density ratio classifier training. + """ + n_samples = min(len(dataset), sample_size) + rand_indices = np.random.choice(len(dataset), size=(n_samples,)) + + embedding_dataset = dataset.select(rand_indices) + + dataloader_params = { + "batch_size": self.args.per_device_train_batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(embedding_dataset, **dataloader_params)) + + with torch.no_grad(): + all_embeddings = torch.empty(0) + for padded_batch in tqdm(iterable=data_loader, desc="Building sample prompt embeddings"): + embeddings = self._vectorize_prompt( + input_ids=padded_batch["embedding_input_ids"], + attention_mask=padded_batch["embedding_attention_mask"], + ) + embeddings = self.accelerator.gather_for_metrics(embeddings) + all_embeddings = torch.cat((all_embeddings, embeddings.cpu())) + + return all_embeddings + + def _save_optimizer_and_scheduler(self, output_dir): + output_dir = output_dir if output_dir is not None else self.args.output_dir + super()._save_optimizer_and_scheduler(output_dir) + + if self.accelerator.is_main_process: + # When saving optimizer and scheduler to checkpoint, save also the running delta object. + self.running.save_to_json(os.path.join(output_dir, RUNNING_NAME)) + + if self.match_underlying_distribution: + joblib.dump(self.clf, os.path.join(output_dir, CLF_NAME), compress=True) + + def _load_optimizer_and_scheduler(self, checkpoint): + if checkpoint is None: + logger.warning_once(f"Missing Checkpoint {checkpoint}") + return + + super()._load_optimizer_and_scheduler(checkpoint) + + # when loading optimizer and scheduler from checkpoint, also load the running delta object. + running_file = os.path.join(checkpoint, RUNNING_NAME) + if os.path.isfile(running_file): + self.running = RunningMoments.load_from_json(self.accelerator, running_file) + + if self.match_underlying_distribution: + clf_file = os.path.join(checkpoint, CLF_NAME) + if os.path.isfile(clf_file): + self.clf = joblib.load(clf_file) + + @contextmanager + def null_ref_context(self): + """Context manager for handling null reference model (that is, peft adapter manipulation).""" + with ( + self.accelerator.unwrap_model(self.model).disable_adapter() + if self.is_peft_model and not self.ref_adapter_name + else nullcontext() + ): + if self.ref_adapter_name: + self.model.set_adapter(self.ref_adapter_name) + yield + if self.ref_adapter_name: + self.model.set_adapter(self.model_adapter_name or "default") + + def get_train_dataloader(self) -> DataLoader: + """ + Returns the training [`~torch.utils.data.DataLoader`]. + + Subclass of transformers.src.transformers.trainer.get_train_dataloader to precompute `ref_log_probs`. 
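+ + The reference log probabilities are computed once, stored in a `reference_logps` column on `self.train_dataset`, and reused on subsequent calls.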
+ """ + + if self.precompute_ref_log_probs and not self._precomputed_train_ref_log_probs: + dataloader_params = { + "batch_size": self.args.per_device_train_batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(self.train_dataset, **dataloader_params)) + reference_completion_logps = [] + + for padded_batch in tqdm(iterable=data_loader, desc="Train dataset reference log probs"): + reference_completion_logp = self.compute_reference_log_probs(padded_batch) + + reference_completion_logp = self.accelerator.gather_for_metrics(reference_completion_logp) + reference_completion_logps.append(reference_completion_logp.cpu()) + + self.train_dataset = self.train_dataset.add_column( + name="reference_logps", column=torch.cat(reference_completion_logps).float().numpy() + ) + + self._precomputed_train_ref_log_probs = True + + return super().get_train_dataloader() + + def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: + """ + Returns the evaluation [`~torch.utils.data.DataLoader`]. + + Subclass of transformers.src.transformers.trainer.get_eval_dataloader to precompute `ref_log_probs`. + + Args: + eval_dataset (`torch.utils.data.Dataset`, *optional*): + If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted + by the `model.forward()` method are automatically removed. It must implement `__len__`. + """ + if eval_dataset is None and self.eval_dataset is None: + raise ValueError("Trainer: evaluation requires an eval_dataset.") + eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + + if self.precompute_ref_log_probs and not self._precomputed_eval_ref_log_probs: + dataloader_params = { + "batch_size": self.args.per_device_eval_batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(eval_dataset, **dataloader_params)) + + reference_completion_logps = [] + + for padded_batch in tqdm(iterable=data_loader, desc="Eval dataset reference log probs"): + reference_completion_logp = self.compute_reference_log_probs(padded_batch) + + reference_completion_logp = self.accelerator.gather_for_metrics(reference_completion_logp) + reference_completion_logps.append(reference_completion_logp.cpu()) + + eval_dataset = eval_dataset.add_column( + name="reference_logps", column=torch.cat(reference_completion_logps).float().numpy() + ) + + # Save calculated reference_chosen_logps and reference_rejected_logps to the eval_dataset for subsequent runs + if self.eval_dataset is not None: + self.eval_dataset = eval_dataset + self._precomputed_eval_ref_log_probs = True + + return super().get_eval_dataloader(eval_dataset=eval_dataset) + + def compute_reference_log_probs(self, padded_batch: dict) -> dict: + """Computes log probabilities of the reference model for a single padded batch of a BCO specific dataset.""" + with torch.no_grad(): + if self.ref_model is None: + with self.null_ref_context(): + if self.is_encoder_decoder: + completion_logits = self.model( + padded_batch["prompt_input_ids"], + attention_mask=padded_batch["prompt_attention_mask"], + decoder_input_ids=padded_batch.get("completion_decoder_input_ids"), + 
labels=padded_batch["completion_labels"], + ).logits + + else: + completion_logits = self.model( + padded_batch["completion_input_ids"], + attention_mask=padded_batch["completion_attention_mask"], + ).logits + + else: + if self.is_encoder_decoder: + completion_logits = self.ref_model( + padded_batch["prompt_input_ids"], + attention_mask=padded_batch["prompt_attention_mask"], + decoder_input_ids=padded_batch.get("completion_decoder_input_ids"), + labels=padded_batch["completion_labels"], + ).logits + + else: + completion_logits = self.ref_model( + padded_batch["completion_input_ids"], attention_mask=padded_batch["completion_attention_mask"] + ).logits + + completion_logps = self.get_batch_logps( + completion_logits, + padded_batch["completion_labels"], + average_log_prob=False, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + + return completion_logps + + @staticmethod + def get_batch_logps( + logits: torch.FloatTensor, + labels: torch.LongTensor, + average_log_prob: bool = False, + label_pad_token_id: int = -100, + is_encoder_decoder: bool = False, + ) -> torch.FloatTensor: + """Compute the log probabilities of the given labels under the given logits. + + Args: + logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size) + labels: + Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are + ignored. Shape: (batch_size, sequence_length) + average_log_prob: + If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the + log probabilities of the (non-masked) tokens. + label_pad_token_id: + The label value to ignore when computing log probabilities. + is_encoder_decoder: + Whether the model is an encoder-decoder model. If True, the labels are not shifted, and the logits are + assumed to already be aligned with the labels. If False, the labels are shifted to the right by one + position, and the logits are assumed to be aligned with the shifted labels. + + Returns: + A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the + given logits. 
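+ + Example (illustrative shapes only, not from the source): for a decoder-only model, `logits` of shape `(2, 10, 32000)` and `labels` of shape `(2, 10)` yield a tensor of shape `(2,)`; internally the labels are shifted by one position against the logits before the per-token log probabilities are summed.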
+ """ + if logits.shape[:-1] != labels.shape: + raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.") + + if not is_encoder_decoder: + labels = labels[:, 1:].clone() + logits = logits[:, :-1, :] + else: + # Fixes end-dec RuntimeError + labels = labels.clone() + + loss_mask = labels != label_pad_token_id + + # dummy token; we'll ignore the losses on these tokens later + labels[labels == label_pad_token_id] = 0 + + per_token_logps = selective_log_softmax(logits, labels) + + if average_log_prob: + return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) + else: + return (per_token_logps * loss_mask).sum(-1) + + def forward( + self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]] + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + model_kwargs = ( + { + "labels": batch["completion_labels"], + "decoder_input_ids": batch.get("completion_decoder_input_ids"), + } + if self.is_encoder_decoder + else {} + ) + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + outputs = model( + batch["completion_input_ids"], + attention_mask=batch["completion_attention_mask"], + **model_kwargs, + ) + completion_logits = outputs.logits + + completion_logps = self.get_batch_logps( + completion_logits, + batch["completion_labels"], + average_log_prob=False, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + + if completion_logps.shape[0] != len(batch["label"]): + raise ValueError( + "There is a mismatch between the number of examples in this batch and the number of " + "examples for which an output sequence was predicted." + ) + + chosen_idx = [i for i in range(completion_logps.shape[0]) if batch["label"][i] is True] + rejected_idx = [i for i in range(completion_logps.shape[0]) if batch["label"][i] is False] + + chosen_logps = completion_logps[chosen_idx, ...] + rejected_logps = completion_logps[rejected_idx, ...] + + chosen_logits = completion_logits[chosen_idx, ...] + rejected_logits = completion_logits[rejected_idx, ...] + + if self.aux_loss_enabled: + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, outputs.aux_loss) + else: + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits) + + def _get_udm_weight(self, rejected_embeddings: torch.FloatTensor) -> torch.FloatTensor: + prob_desirable = self._get_chosen_prob(rejected_embeddings) + min_ratio = self.args.min_density_ratio + max_ratio = self.args.max_density_ratio + + weight = (prob_desirable / (1 - prob_desirable + 1e-8)).clamp(min=min_ratio, max=max_ratio) + + return weight + + def bco_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + reference_chosen_logps: torch.FloatTensor, + reference_rejected_logps: torch.FloatTensor, + chosen_embeddings: Optional[torch.FloatTensor], + rejected_embeddings: Optional[torch.FloatTensor], + do_train: bool = True, + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """Compute the BCO loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps: + Log probabilities of the policy model for the chosen responses. Shape: (num(chosen) in batch_size,) + policy_rejected_logps: + Log probabilities of the policy model for the rejected responses. Shape: (num(rejected) in batch_size,) + reference_chosen_logps: + Log probabilities of the reference model for the chosen responses. 
Shape: (num(chosen) in batch_size,) + reference_rejected_logps: + Log probabilities of the reference model for the rejected responses. Shape: (num(rejected) in + batch_size,) + chosen_embeddings: embeddings of desirable prompts + rejected_embeddings: embeddings of undesirable prompts + do_train: whether to update the running delta value. Default is True. + + Returns: + A tuple of four tensors: (losses, chosen_rewards, rejected_rewards, delta). The losses tensor contains the + BCO loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards + for the chosen and rejected responses, respectively. The delta value contains the moving average of all + implicit rewards. + """ + + chosen_logratios = policy_chosen_logps - reference_chosen_logps + chosen_rewards = self.beta * chosen_logratios + + rejected_logratios = policy_rejected_logps - reference_rejected_logps + rejected_rewards = self.beta * rejected_logratios + + if do_train: + self.running.update(torch.cat((chosen_rewards, rejected_rewards), 0).detach()) + delta = torch.as_tensor(self.running.mean, device=chosen_rewards.device) + + chosen_losses = -F.logsigmoid(chosen_rewards - delta) + rejected_losses = -F.logsigmoid(-(rejected_rewards - delta)) + + if self.match_underlying_distribution: + chosen_weight = torch.ones_like(chosen_losses) + rejected_weight = self._get_udm_weight(rejected_embeddings) + + losses = torch.cat((chosen_weight * chosen_losses, rejected_weight * rejected_losses), dim=0) + else: + losses = torch.cat((chosen_losses, rejected_losses), dim=0) + + return losses, chosen_rewards, rejected_rewards, delta + + def get_batch_loss_metrics( + self, + model, + batch: dict[str, Union[list, torch.LongTensor]], + do_train: bool = True, + ): + """Compute the BCO loss and other metrics for the given batch of inputs for train or test.""" + metrics = {} + batch = {k: (v.to(self.accelerator.device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()} + + forward_output = self.forward(model, batch) + ( + policy_chosen_logps, + policy_rejected_logps, + policy_chosen_logits, + policy_rejected_logits, + ) = forward_output[:4] + if self.aux_loss_enabled: + aux_loss = forward_output[4] + + # if reference_logps in batch use them, otherwise use the reference model + if "reference_logps" in batch: + chosen_idx = [i for i in range(batch["reference_logps"].shape[0]) if batch["label"][i] is True] + rejected_idx = [i for i in range(batch["reference_logps"].shape[0]) if batch["label"][i] is False] + + reference_chosen_logps = batch["reference_logps"][chosen_idx, ...] + reference_rejected_logps = batch["reference_logps"][rejected_idx, ...] 
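+ # No precomputed `reference_logps` in the batch: fall back to computing them below with the reference model (or the policy model with adapters disabled).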
+ else: + with torch.no_grad(): + if self.ref_model is None: + with self.null_ref_context(): + ( + reference_chosen_logps, + reference_rejected_logps, + _, + _, + ) = self.forward(self.model, batch)[:4] + else: + ( + reference_chosen_logps, + reference_rejected_logps, + _, + _, + ) = self.forward(self.ref_model, batch)[:4] + + chosen_embeddings, rejected_embeddings = self._get_prompt_embeddings(batch) + + losses, chosen_rewards, rejected_rewards, delta = self.bco_loss( + policy_chosen_logps, + policy_rejected_logps, + reference_chosen_logps, + reference_rejected_logps, + chosen_embeddings, + rejected_embeddings, + do_train=do_train, + ) + metrics["delta"] = self.accelerator.gather_for_metrics(delta).mean().item() + + num_chosen = torch.Tensor([len(chosen_rewards)]).to(self.accelerator.device) + num_rejected = torch.Tensor([len(rejected_rewards)]).to(self.accelerator.device) + + all_num_chosen = self.accelerator.gather_for_metrics(num_chosen).sum().item() + all_num_rejected = self.accelerator.gather_for_metrics(num_rejected).sum().item() + + if all_num_chosen > 0: + metrics["rewards/chosen_sum"] = ( + self.accelerator.gather_for_metrics(chosen_rewards.nansum()).nansum().item() + ) + metrics["logps/chosen_sum"] = ( + self.accelerator.gather_for_metrics(policy_chosen_logps.nansum()).nansum().item() + ) + metrics["logits/chosen_sum"] = ( + self.accelerator.gather_for_metrics(policy_chosen_logits.nansum()).nansum().item() + ) + metrics["count/chosen"] = all_num_chosen + + if all_num_rejected > 0: + metrics["rewards/rejected_sum"] = ( + self.accelerator.gather_for_metrics(rejected_rewards.nansum()).nansum().item() + ) + metrics["logps/rejected_sum"] = ( + self.accelerator.gather_for_metrics(policy_rejected_logps.nansum()).nansum().item() + ) + metrics["logits/rejected_sum"] = ( + self.accelerator.gather_for_metrics(policy_rejected_logits.nansum()).nansum().item() + ) + metrics["count/rejected"] = all_num_rejected + + loss = losses.nanmean() + if self.aux_loss_enabled: + loss += self.aux_loss_coef * aux_loss + + return loss, metrics + + def compute_loss( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + return_outputs=False, + num_items_in_batch=None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]: + compute_loss_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with compute_loss_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs) + + # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class: + loss = loss.to(self.args.device) + # force log the metrics + if self.accelerator.is_main_process: + self.store_metrics(metrics, train_eval="train") + + if return_outputs: + return (loss, metrics) + return loss + + def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None: + for key, value in metrics.items(): + self._stored_metrics[train_eval][key].append(value) + + def _get_train_sampler(self, dataset: Optional[Dataset] = None) -> Optional[torch.utils.data.Sampler]: + if dataset is None: + dataset = self.train_dataset + if dataset is None or not has_length(dataset): + return None + return SequentialSampler(dataset) + + def generate_from_model_and_ref(self, model, batch: dict[str, torch.LongTensor]) -> tuple[str, str]: + """Generate samples from the model and reference model for the given batch of inputs.""" + + # If one uses 
`generate_during_eval` with peft + bf16, we need to explicitly call generate with + # the torch amp context manager as some hidden states are silently casted to full precision. + generate_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + with generate_context_manager: + policy_output = model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + + # if reference_output in batch use that otherwise use the reference model + if "reference_output" in batch: + reference_output = batch["reference_output"] + else: + if self.ref_model is None: + with self.null_ref_context(): + reference_output = self.model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + else: + reference_output = self.ref_model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + + policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id) + policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True) + + reference_output = pad_to_length(reference_output, self.max_length, self.processing_class.pad_token_id) + reference_output_decoded = self.processing_class.batch_decode(reference_output, skip_special_tokens=True) + + return policy_output_decoded, reference_output_decoded + + def prediction_step( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[list[str]] = None, + ): + if ignore_keys is None: + if hasattr(model, "config"): + ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + prediction_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + with torch.no_grad(), prediction_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs, do_train=False) + + # force log the metrics + if self.accelerator.is_main_process: + self.store_metrics(metrics, train_eval="eval") + + if prediction_loss_only: + return (loss.detach(), None, None) + + # logits for the chosen and rejected samples from model + logits_dict = {} + if "logits/chosen_sum" in metrics: + logits_dict["eval_logits/chosen"] = metrics["logits/chosen_sum"] + if "logits/rejected_sum" in metrics: + logits_dict["eval_logits/rejected"] = metrics["logits/rejected_sum"] + logits = [v for k, v in logits_dict.items() if k not in ignore_keys] + logits = torch.tensor(logits, device=self.accelerator.device) + labels = torch.zeros(logits.shape[0], device=self.accelerator.device) + + return (loss.detach(), logits, labels) + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "eval", + ) -> EvalLoopOutput: + """ + Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by + `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. 
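+ + When `generate_during_eval` is enabled, a single random batch is sampled, completions from the policy and reference model are decoded, and the results are logged as a table to Weights & Biases or Comet.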
+ """ + + # Sample and save to game log if requested (for one batch to save time) + if self.generate_during_eval: + # Generate random indices within the range of the total number of samples + num_samples = len(dataloader.dataset) + random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size) + + # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader + random_batch_dataset = dataloader.dataset.select(random_indices) + random_batch = self.data_collator(random_batch_dataset) + random_batch = self._prepare_inputs(random_batch) + + target_labels = torch.tensor(random_batch["label"], dtype=torch.bool, device=self.accelerator.device) + target_indices = torch.where(~target_labels)[0] + target_batch = { + "prompt_input_ids": random_batch["prompt_input_ids"][target_indices], + "prompt_attention_mask": random_batch["prompt_attention_mask"][target_indices], + "prompt": itemgetter(*target_indices)(random_batch["prompt"]), + } + policy_output_decoded, ref_output_decoded = self.generate_from_model_and_ref(self.model, target_batch) + + table = pd.DataFrame( + columns=["Prompt", "Policy", "Ref Model"], + data=[ + [prompt, pol[len(prompt) :], ref[len(prompt) :]] + for prompt, pol, ref in zip(target_batch["prompt"], policy_output_decoded, ref_output_decoded) + ], + ) + if "wandb" in self.args.report_to: + wandb.log({"game_log": wandb.Table(data=table)}) + + if "comet_ml" in self.args.report_to: + log_table_to_comet_experiment( + name="game_log.csv", + table=table, + ) + + # Base evaluation + initial_output = super().evaluation_loop( + dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix + ) + + return initial_output + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + """ + Log `logs` on the various objects watching training, including stored metrics. + + Args: + logs (`dict[str, float]`): + The values to log. + start_time (`float` or `None`, *optional*, defaults to `None`): + Start time of the training. 
+ """ + # logs either has 'loss' or 'eval_loss' + train_eval = "train" if "loss" in logs else "eval" + # train metrics should have no prefix, eval should have 'eval_' + prefix = "eval_" if train_eval == "eval" else "" + # accumulate average metrics from sums and lengths + for split in ["chosen", "rejected"]: + if f"count/{split}" in self._stored_metrics[train_eval]: + count_sum = torch.Tensor(self._stored_metrics[train_eval][f"count/{split}"]).sum().item() + for metric in ["rewards", "logps", "logits"]: + logs[f"{prefix}{metric}/{split}"] = ( + torch.Tensor(self._stored_metrics[train_eval][f"{metric}/{split}_sum"]).sum().item() + / count_sum + ) + # delete obsolete metric + del self._stored_metrics[train_eval][f"{metric}/{split}_sum"] + del self._stored_metrics[train_eval][f"count/{split}"] + # calculate reward margin + if f"{prefix}rewards/chosen" in logs and f"{prefix}rewards/rejected" in logs: + logs[f"{prefix}rewards/margins"] = logs[f"{prefix}rewards/chosen"] - logs[f"{prefix}rewards/rejected"] + # Add averaged stored metrics to logs + for key, metrics in self._stored_metrics[train_eval].items(): + logs[f"{prefix}{key}"] = torch.Tensor(metrics).mean().item() + del self._stored_metrics[train_eval] + return super().log(logs, start_time) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + # docstyle-ignore + citation = textwrap.dedent("""\ + @article{jung2024binary, + title = {{Binary Classifier Optimization for Large Language Model Alignment}}, + author = {Seungjae Jung and Gunsoo Han and Daniel Wontae Nam and Kyoung{-}Woon On}, + year = 2024, + eprint = {arXiv:2404.04656} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="BCO", + trainer_citation=citation, + paper_title="Binary Classifier Optimization for Large Language Model Alignment", + paper_id="2404.04656", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothBCOTrainer(_UnslothBCOTrainer): + """ + +Initialize BCOTrainer from [BCO](https://huggingface.co/papers/2404.04656) paper. + +Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForSequenceClassification`. + ref_model (`PreTrainedModelWrapper`): + Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation + and loss. If no reference model is provided, the trainer will create a reference model with the same + architecture as the model to be optimized. + args (`BCOConfig`): + The arguments to use for training. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + data_collator (`transformers.DataCollator`, *optional*, defaults to `None`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. 
+ peft_config (`dict`, defaults to `None`): + The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in + a PEFT model. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take an `EvalPrediction` and return a dictionary mapping + strings to metric values. + model_adapter_name (`str`, defaults to `None`): + Name of the train target PEFT adapter, when using LoRA with multiple adapters. + ref_adapter_name (`str`, defaults to `None`): + Name of the reference PEFT adapter, when using LoRA with multiple adapters. + + """ + def __init__( + self, + model = None, + ref_model = None, + args = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + data_collator = None, + model_init = None, + callbacks = None, + preprocess_logits_for_metrics = None, + peft_config = None, + compute_metrics = None, + model_adapter_name = None, + ref_adapter_name = None, + embedding_func = None, + embedding_tokenizer = None, + **kwargs + ): + if args is None: args = UnslothBCOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if 
isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('bco_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? [TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + model = model, + ref_model = ref_model, + args = args, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + data_collator = data_collator, + model_init = model_init, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config, + compute_metrics = compute_metrics, + model_adapter_name = model_adapter_name, + ref_adapter_name = ref_adapter_name, + embedding_func = embedding_func, + embedding_tokenizer = embedding_tokenizer,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass + + +if hasattr(logger, "addFilter"): + import logging + class HideLoggingMessage(logging.Filter): + def __init__(self, text): self.text = text + def filter(self, x): return not (self.text in x.getMessage()) + pass + logger.addFilter(HideLoggingMessage("`use_cache=True`")) + diff --git a/unsloth_compiled_cache/UnslothCPOTrainer.py b/unsloth_compiled_cache/UnslothCPOTrainer.py new file mode 100644 index 
0000000000000000000000000000000000000000..728a6072733c67e6a1d3c40192d28a5478f4b2c5
--- /dev/null
+++ b/unsloth_compiled_cache/UnslothCPOTrainer.py
@@ -0,0 +1,1866 @@
+"""
+2025.11.2
+2025.11.1
+4.57.2
+0.23.0
+__UNSLOTH_VERSIONING__
+"""
+
+# Unsloth auto generated code
+# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from torch import Tensor
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
+from trl.trainer.cpo_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, CPOConfig, CPOTrainer, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, Literal, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, Trainer, TrainerCallback, Union, add_bos_token_if_needed, add_eos_token_if_needed, autocast, defaultdict, disable_dropout_in_model, generate_model_card, get_comet_experiment_url, inspect, is_comet_available, is_peft_available, is_torch_fx_proxy, is_wandb_available, log_table_to_comet_experiment, logger, logging, maybe_apply_chat_template, maybe_extract_prompt, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_model_for_kbit_training, random, selective_log_softmax, textwrap, torch, F, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, logger, os, torch)
+
+
+import os
+from typing import *
+from dataclasses import dataclass, field
+from packaging.version import Version
+import torch
+import numpy as np
+from contextlib import nullcontext
+from torch.nn import functional as F
+from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling
+from transformers.training_args import ParallelMode
+
+# Wrap trainer with padding to right and enable training mode
+import functools
+from types import MethodType
+def prepare_for_training_mode(f):
+    @functools.wraps(f)
+    def wrapper(self, *args, **kwargs):
+        # Enable training mode
+        if hasattr(self, 'model') and hasattr(self.model, "for_training"):
+            self.model.for_training()
+        output = f(self, *args, **kwargs)
+        # Return inference mode
+        if hasattr(self, 'model') and hasattr(self.model, "for_inference"):
+            self.model.for_inference()
+        return output
+    return wrapper
+pass
+
+torch_compile_options = {
+    "epilogue_fusion"   : True,
+    "max_autotune"      : False,
+    "shape_padding"     : True,
+    "trace.enabled"     : False,
+    "triton.cudagraphs" : False,
+}
+
+@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,)
+def chunked_selective_log_softmax(logits, index):
+    # Split into 4 chunks only
+    chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0)
+    chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0)
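+    # The float32 upcast happens one chunk at a time in the loop below, so only
+    # about a quarter of the flattened logits is ever held in float32 for the
+    # gather/logsumexp, which bounds peak memory.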
+    all_per_token_logps = []
+    # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index)
+    for chunk_logits, chunk_index in zip(chunked_logits, chunked_index):
+        chunk_logits = chunk_logits.to(torch.float32)
+        selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1)
+        logsumexp_values = torch.logsumexp(chunk_logits, dim = -1)
+        per_token_logps = selected_logits - logsumexp_values
+        all_per_token_logps.append(per_token_logps)
+    pass
+    all_per_token_logps = torch.concat(all_per_token_logps)
+    all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1]))
+    return all_per_token_logps
+
+def calculate_pad_tokens_in_prompt(
+    input_ids: torch.Tensor,
+    logits_to_keep: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a prompt tensor, return the number of left-padding tokens in each sequence, e.g. [pad, pad, pad, cat] -> 3 tokens.
+    """
+    if logits_to_keep >= input_ids.shape[1]:
+        raise ValueError("logits_to_keep must be smaller than the sequence length.")
+
+    prompt_section = input_ids[:, :-logits_to_keep]
+
+    padding_mask = (prompt_section == pad_token_id)
+
+    pad_token_counts = padding_mask.sum(dim=1)
+
+    return pad_token_counts
+
+def create_completion_attention_mask(
+    completion_input_ids: torch.Tensor,
+    left_pad_tokens_per_prompt: torch.Tensor,
+    max_left_pad: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a sequence [p,p,p,c,c,c,pad,pad,pad], where p are leftover prompt tokens from slicing the tensor,
+    c are completion tokens, and pad are padding tokens, build a completion mask that zeroes out the p and
+    pad tokens, i.e. [0,0,0,1,1,1,0,0,0] in this example.
+    """
+    batch_size, completion_len = completion_input_ids.shape
+    device = completion_input_ids.device
+
+    num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
+
+    indices = torch.arange(completion_len, device=device).unsqueeze(0)
+    shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
+
+    non_padding_mask = (completion_input_ids != pad_token_id)
+
+    final_mask = shift_mask & non_padding_mask
+
+    return final_mask
+
+def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
+    """
+    Moves all padding tokens in each sequence of a batch to the right.
+    """
+    mask = (tensor != pad_id)
+    # Must use stable=True: the boolean mask contains ties, and only a stable sort preserves the original token order
+    sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
+    packed_tensor = torch.gather(tensor, 1, sorted_indices)
+    return packed_tensor
+
+def align_logprobs_with_mask(
+    logprob_tensor: torch.Tensor,
+    attention_mask: torch.Tensor,
+    pad_value: float = 0.0
+) -> torch.Tensor:
+    """
+    Aligns a log probability tensor with a given attention mask.
+    """
+
+    device = logprob_tensor.device
+    batch_size, logprob_seq_len = logprob_tensor.shape
+    mask_seq_len = attention_mask.shape[1]
+
+    padded_logprobs = torch.full(
+        attention_mask.shape,
+        fill_value=pad_value,
+        dtype=logprob_tensor.dtype,
+        device=device
+    )
+
+    left_pad_counts = torch.argmax(attention_mask, dim=1)
+
+    cols = torch.arange(logprob_seq_len, device=device)
+    dest_indices = left_pad_counts.unsqueeze(1) + cols
+
+    # Create destination row indices
+    # Shape: [batch_size, logprob_seq_len]
+    row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
+
+    # Filter out-of-bounds indices and perform the assignment:
+    # create a mask to identify only the indices that are within the bounds
+    # of the target tensor's sequence length.
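+    # (Destination indices that would fall past mask_seq_len are dropped below
+    # rather than raising an out-of-bounds indexing error.)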
+ valid_mask = dest_indices < mask_seq_len + + # Use this mask to select only the valid row indices, column indices, + # and the corresponding values from the logprob tensor. + # This flattens the selected elements into 1D tensors. + valid_rows = row_indices[valid_mask] + valid_cols = dest_indices[valid_mask] + valid_vals = logprob_tensor[valid_mask] + + # Place the valid values into their correct positions in the padded tensor + # using a single, efficient advanced indexing operation. + padded_logprobs[valid_rows, valid_cols] = valid_vals + + return padded_logprobs +@dataclass +class UnslothCPOConfig(CPOConfig): + """ + +Configuration class for the [`CPOTrainer`]. + +This class includes only the parameters that are specific to CPO training. For a full list of training arguments, +please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may +differ from those in [`~transformers.TrainingArguments`]. + +Using [`~transformers.HfArgumentParser`] we can turn this class into +[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the +command line. + +Parameters: + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`int` or `None`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in + the [paper](https://huggingface.co/papers/2310.12036). + label_smoothing (`float`, *optional*, defaults to `0.0`): + Label smoothing factor. This argument is required if you want to use the default data collator. + loss_type (`str`, *optional*, defaults to `"sigmoid"`): + Type of loss to use. Possible values are: + + - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. + - `"hinge"`: hinge loss on the normalized likelihood from the + [SLiC](https://huggingface.co/papers/2305.10425) paper. + - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper. + - `"simpo"`: SimPO loss from the [SimPO](https://huggingface.co/papers/2405.14734) paper. + - `"alphapo"`: AlphaPO loss from the [AlphaPO](https://huggingface.co/papers/2501.03884) paper. This + automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`. + + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + cpo_alpha (`float`, *optional*, defaults to `1.0`): + Weight of the BC regularizer in CPO training. + simpo_gamma (`float`, *optional*, defaults to `0.5`): + Target reward margin for the SimPO loss, used only when the `loss_type="simpo"`. + alpha (`float`, *optional*, defaults to `0.0`): + Alpha parameter that controls reward function shape across all loss types. When alpha=0 (default), uses + standard log probability rewards. 
When `alpha != 0`, applies AlphaPO transformation: `r = (1 - p^(-alpha)) + / alpha` from the [AlphaPO paper](https://huggingface.co/papers/2501.03884). This parameter works with all + loss types. + label_pad_token_id (`int`, *optional*, defaults to `-100`): + Label pad token id. This argument is required if you want to use the default data collator. + padding_value (`int` or `None`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + truncation_mode (`str`,*optional*, defaults to `"keep_end"`): + Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. + This argument is required if you want to use the default data collator. + generate_during_eval (`bool`, *optional*, defaults to `False`): + If `True`, generates and logs completions from the model to W&B or Comet during evaluation. + is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`): + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + max_seq_length : Optional[int] = field( + default = None, + metadata = {'help': 'Maximum sequence length to truncate to.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + 
past_index = -1,
+        run_name = None,
+        disable_tqdm = None,
+        remove_unused_columns = True,
+        label_names = None,
+        load_best_model_at_end = False,
+        metric_for_best_model = None,
+        greater_is_better = None,
+        ignore_data_skip = False,
+        fsdp = None,
+        fsdp_min_num_params = 0,
+        fsdp_config = None,
+        fsdp_transformer_layer_cls_to_wrap = None,
+        accelerator_config = None,
+        parallelism_config = None,
+        deepspeed = None,
+        label_smoothing_factor = 0.0,
+        optim = 'adamw_8bit',
+        optim_args = None,
+        adafactor = False,
+        group_by_length = False,
+        length_column_name = 'length',
+        report_to = None,
+        project = 'huggingface',
+        trackio_space_id = 'trackio',
+        ddp_find_unused_parameters = None,
+        ddp_bucket_cap_mb = None,
+        ddp_broadcast_buffers = None,
+        dataloader_pin_memory = True,
+        dataloader_persistent_workers = False,
+        skip_memory_metrics = True,
+        use_legacy_prediction_loop = False,
+        push_to_hub = False,
+        resume_from_checkpoint = None,
+        hub_model_id = None,
+        hub_strategy = 'every_save',
+        hub_token = None,
+        hub_private_repo = None,
+        hub_always_push = False,
+        hub_revision = None,
+        gradient_checkpointing = True,
+        gradient_checkpointing_kwargs = None,
+        include_inputs_for_metrics = False,
+        eval_do_concat_batches = True,
+        fp16_backend = 'auto',
+        push_to_hub_model_id = None,
+        push_to_hub_organization = None,
+        push_to_hub_token = None,
+        mp_parameters = '',
+        auto_find_batch_size = False,
+        full_determinism = False,
+        torchdynamo = None,
+        ray_scope = 'last',
+        ddp_timeout = 1800,
+        torch_compile = False,
+        torch_compile_backend = None,
+        torch_compile_mode = None,
+        include_tokens_per_second = False,
+        include_num_input_tokens_seen = False,
+        neftune_noise_alpha = None,
+        optim_target_modules = None,
+        batch_eval_metrics = False,
+        eval_on_start = False,
+        use_liger_kernel = False,
+        liger_kernel_config = None,
+        eval_use_gather_object = False,
+        average_tokens_across_devices = True,
+        max_length = 1024,
+        max_prompt_length = 512,
+        max_completion_length = None,
+        beta = 0.1,
+        label_smoothing = 0.0,
+        loss_type = 'sigmoid',
+        disable_dropout = True,
+        cpo_alpha = 1.0,
+        simpo_gamma = 0.5,
+        alpha = 0.0,
+        label_pad_token_id = -100,
+        padding_value = None,
+        truncation_mode = 'keep_end',
+        generate_during_eval = False,
+        is_encoder_decoder = None,
+        model_init_kwargs = None,
+        dataset_num_proc = None,
+        vllm_sampling_params = None,
+        unsloth_num_chunks = -1,
+        max_seq_length = None,
+        **kwargs,
+    ):
+        if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small (less than 1e-7)! Consider increasing it, otherwise gradient updates will be close to 0!')
+        if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is far too large (> 1)!
Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = min(max(cpu_count()+4, 2), 64) + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + 
ddp_find_unused_parameters = ddp_find_unused_parameters,
+            ddp_bucket_cap_mb = ddp_bucket_cap_mb,
+            ddp_broadcast_buffers = ddp_broadcast_buffers,
+            dataloader_pin_memory = dataloader_pin_memory,
+            dataloader_persistent_workers = dataloader_persistent_workers,
+            skip_memory_metrics = skip_memory_metrics,
+            use_legacy_prediction_loop = use_legacy_prediction_loop,
+            push_to_hub = push_to_hub,
+            resume_from_checkpoint = resume_from_checkpoint,
+            hub_model_id = hub_model_id,
+            hub_strategy = hub_strategy,
+            hub_token = hub_token,
+            hub_private_repo = hub_private_repo,
+            hub_always_push = hub_always_push,
+            hub_revision = hub_revision,
+            gradient_checkpointing = gradient_checkpointing,
+            gradient_checkpointing_kwargs = gradient_checkpointing_kwargs,
+            include_inputs_for_metrics = include_inputs_for_metrics,
+            eval_do_concat_batches = eval_do_concat_batches,
+            fp16_backend = fp16_backend,
+            push_to_hub_model_id = push_to_hub_model_id,
+            push_to_hub_organization = push_to_hub_organization,
+            push_to_hub_token = push_to_hub_token,
+            mp_parameters = mp_parameters,
+            auto_find_batch_size = auto_find_batch_size,
+            full_determinism = full_determinism,
+            torchdynamo = torchdynamo,
+            ray_scope = ray_scope,
+            ddp_timeout = ddp_timeout,
+            torch_compile = torch_compile,
+            torch_compile_backend = torch_compile_backend,
+            torch_compile_mode = torch_compile_mode,
+            include_tokens_per_second = include_tokens_per_second,
+            include_num_input_tokens_seen = include_num_input_tokens_seen,
+            neftune_noise_alpha = neftune_noise_alpha,
+            optim_target_modules = optim_target_modules,
+            batch_eval_metrics = batch_eval_metrics,
+            eval_on_start = eval_on_start,
+            use_liger_kernel = use_liger_kernel,
+            liger_kernel_config = liger_kernel_config,
+            eval_use_gather_object = eval_use_gather_object,
+            average_tokens_across_devices = average_tokens_across_devices,
+            max_length = max_length,
+            max_prompt_length = max_prompt_length,
+            max_completion_length = max_completion_length,
+            beta = beta,
+            label_smoothing = label_smoothing,
+            loss_type = loss_type,
+            disable_dropout = disable_dropout,
+            cpo_alpha = cpo_alpha,
+            simpo_gamma = simpo_gamma,
+            alpha = alpha,
+            label_pad_token_id = label_pad_token_id,
+            padding_value = padding_value,
+            truncation_mode = truncation_mode,
+            generate_during_eval = generate_during_eval,
+            is_encoder_decoder = is_encoder_decoder,
+            model_init_kwargs = model_init_kwargs,
+            dataset_num_proc = dataset_num_proc,**kwargs)
+        self.vllm_sampling_params = vllm_sampling_params
+        self.unsloth_num_chunks = unsloth_num_chunks
+        self.max_seq_length = max_seq_length
+pass
+
+class _UnslothCPOTrainer(Trainer):
+    r"""
+    Initialize CPOTrainer.
+
+    Args:
+        model (`transformers.PreTrainedModel`):
+            The model to train, preferably an `AutoModelForCausalLM`.
+        args (`CPOConfig`):
+            The CPO config arguments to use for training.
+        data_collator (`transformers.DataCollator`):
+            The data collator to use for training. If None is specified, the default data collator
+            (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        train_dataset (`datasets.Dataset`):
+            The dataset to use for training.
+        eval_dataset (`datasets.Dataset`):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        model_init (`Callable[[], transformers.PreTrainedModel]`):
+            The model initializer to use for training. If None is specified, the default model initializer will be
+            used.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+        peft_config (`dict`, defaults to `None`):
+            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
+            a PEFT model.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take an `EvalPrediction` and return a dictionary mapping
+            metric names to metric values.
+    """
+
+    _tag_names = ["trl", "cpo"]
+
+    def __init__(
+        self,
+        model: Optional[Union[PreTrainedModel, nn.Module, str]] = None,
+        args: Optional[CPOConfig] = None,
+        data_collator: Optional[DataCollator] = None,
+        train_dataset: Optional[Dataset] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
+        processing_class: Optional[
+            Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
+        ] = None,
+        model_init: Optional[Callable[[], PreTrainedModel]] = None,
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
+        peft_config: Optional[dict] = None,
+        compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None,
+    ):
+        if args.model_init_kwargs is None:
+            model_init_kwargs = {}
+        elif not isinstance(model, str):
+            raise ValueError("You passed `model_init_kwargs` to the CPOConfig, but your model is already instantiated.")
+        else:
+            model_init_kwargs = args.model_init_kwargs
+            dtype = model_init_kwargs.get("dtype")
+            if dtype is not None:
+                # Convert to `torch.dtype` if a str is passed
+                if isinstance(dtype, str) and dtype != "auto":
+                    dtype = getattr(torch, dtype)
+                if dtype != "auto" and not isinstance(dtype, torch.dtype):
+                    raise ValueError(
+                        f"Invalid `dtype` passed to the CPOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
+                    )
+                model_init_kwargs["dtype"] = dtype
+
+        if isinstance(model, str):
+            model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
+
+        # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16`
+        # has been called in order to properly call autocast if needed.
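+        # The flag is consulted again below when compute_loss(), generate_from_model()
+        # and prediction_step() build their autocast context managers.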
+        self._peft_has_been_casted_to_bf16 = False
+
+        if not is_peft_available() and peft_config is not None:
+            raise ValueError(
+                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
+            )
+        elif is_peft_available() and peft_config is not None:
+            # if model is a peft model and we have a peft_config, we merge and unload it first
+            if isinstance(model, PeftModel):
+                model = model.merge_and_unload()
+
+            if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False):
+                _support_gc_kwargs = hasattr(
+                    args, "gradient_checkpointing_kwargs"
+                ) and "gradient_checkpointing_kwargs" in list(
+                    inspect.signature(prepare_model_for_kbit_training).parameters
+                )
+
+                prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing}
+
+                if _support_gc_kwargs:
+                    prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs
+
+                model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
+            elif args.gradient_checkpointing:
+                # For backward compatibility with older versions of transformers
+                if hasattr(model, "enable_input_require_grads"):
+                    model.enable_input_require_grads()
+                else:
+
+                    def make_inputs_require_grad(module, input, output):
+                        output.requires_grad_(True)
+
+                    model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+            # get peft model with the given config (Unsloth is assumed to handle PEFT wrapping itself, so the model is kept as-is here)
+            model = model
+            if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(model)
+                # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager
+                self._peft_has_been_casted_to_bf16 = True
+
+        # For models that use gradient_checkpointing, we need to attach a hook that enables input
+        # to explicitly have `requires_grad=True`, otherwise training will either silently
+        # fail or completely fail.
+        elif args.gradient_checkpointing:
+            # For backward compatibility with older versions of transformers
+            if hasattr(model, "enable_input_require_grads"):
+                model.enable_input_require_grads()
+            else:
+
+                def make_inputs_require_grad(module, input, output):
+                    output.requires_grad_(True)
+
+                model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+        if args.generate_during_eval and not (is_wandb_available() or is_comet_available()):
+            raise ValueError(
+                "`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
+                " Please install `wandb` or `comet-ml` to resolve."
+            )
+
+        if model is not None:
+            self.is_encoder_decoder = model.config.is_encoder_decoder
+        elif args.is_encoder_decoder is None:
+            raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.")
+        else:
+            self.is_encoder_decoder = args.is_encoder_decoder
+
+        if self.is_encoder_decoder:
+            self.decoder_start_token_id = model.config.decoder_start_token_id
+            self.pad_token_id = model.config.pad_token_id
+
+        if processing_class is None:
+            raise ValueError("processing_class must be specified to tokenize a CPO dataset.")
+        if args.max_length is None:
+            logger.warning(
+                "`max_length` is not set in the CPOConfig's init;"
+                " it will default to `512`, but you should set it yourself in the future.",
+            )
+            max_length = 512
+        else:
+            max_length = args.max_length
+        if args.max_prompt_length is None:
+            logger.warning(
+                "`max_prompt_length` is not set in the CPOConfig's init;"
+                " it will default to `128`, but you should set it yourself in the future.",
+            )
+            max_prompt_length = 128
+        else:
+            max_prompt_length = args.max_prompt_length
+
+        if not max_prompt_length < max_length:
+            raise ValueError(
+                f"max_prompt_length ({max_prompt_length}) should be strictly less than max_length ({max_length})."
+            )
+
+        if args.max_completion_length is None and self.is_encoder_decoder:
+            logger.warning(
+                "When using an encoder-decoder architecture, you should set `max_completion_length` in the CPOConfig's init;"
+                " it will default to `128`, but you should set it yourself in the future.",
+            )
+            max_completion_length = 128
+        else:
+            max_completion_length = args.max_completion_length
+
+        if data_collator is None:
+            data_collator = DPODataCollatorWithPadding(
+                pad_token_id=processing_class.pad_token_id,
+                label_pad_token_id=args.label_pad_token_id,
+                is_encoder_decoder=self.is_encoder_decoder,
+            )
+
+            if args.remove_unused_columns:
+                args.remove_unused_columns = False
+                # warn users
+                logger.warning(
+                    "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments;"
+                    " we have set it for you, but you should do it yourself in the future.",
+                )
+
+            self.use_dpo_data_collator = True
+        else:
+            self.use_dpo_data_collator = False
+
+        # Disable dropout in the model
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+
+        self.max_length = max_length
+        self.generate_during_eval = args.generate_during_eval
+        self.label_pad_token_id = args.label_pad_token_id
+        self.padding_value = args.padding_value if args.padding_value is not None else processing_class.pad_token_id
+        self.max_prompt_length = max_prompt_length
+        self.truncation_mode = args.truncation_mode
+        self.max_completion_length = max_completion_length
+        self.processing_class = processing_class
+
+        if args.loss_type in ["hinge", "ipo"] and args.label_smoothing > 0:
+            logger.warning(
+                f"You are using the {args.loss_type} loss type that does not support label smoothing. The "
+                "`label_smoothing` parameter will be ignored. Set `label_smoothing` to `0.0` to remove this warning.",
+            )
+        if args.loss_type == "kto_pair":
+            raise ValueError("Support for kto_pair has been removed in CPOTrainer.
Please use KTOTrainer.") + + self.beta = args.beta + self.label_smoothing = args.label_smoothing + self.loss_type = args.loss_type + self.cpo_alpha = args.cpo_alpha + self.aux_loss_enabled = getattr(model.config, "output_router_logits", False) + self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0) + if self.aux_loss_enabled and self.aux_loss_coef == 0.0: + logger.warning( + "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to " + "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value " + "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary " + "loss.", + ) + + if args.loss_type == "simpo": + self.simpo_gamma = args.simpo_gamma + + # AlphaPO parameter for reward shaping + self.alpha = args.alpha + + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in CPO, the sampled data does not include the + # "input_ids" key. Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and + # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens + # of the input, floating-point operations will not be computed." To suppress this warning, we set the + # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate + # that the warning has already been issued. + model.warnings_issued["estimate_tokens"] = True + + # Compute that only on the main process for faster data processing. + # see: https://github.com/huggingface/trl/pull/1255 + with PartialState().main_process_first(): + # Extract the prompt if needed, and apply the chat template if needed + train_dataset = train_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc) + train_dataset = train_dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}, num_proc=args.dataset_num_proc + ) + if eval_dataset is not None: + eval_dataset = eval_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc) + eval_dataset = eval_dataset.map( + maybe_apply_chat_template, + fn_kwargs={"tokenizer": processing_class}, + num_proc=args.dataset_num_proc, + ) + + # tokenize the dataset + train_dataset = train_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc) + if eval_dataset is not None: + eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc) + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. 
+        self.model_accepts_loss_kwargs = False
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        if not hasattr(self, "accelerator"):
+            raise AttributeError(
+                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
+            )
+
+    def build_tokenized_answer(self, prompt, answer):
+        """
+        Llama tokenizer does not satisfy `enc(a + b) = enc(a) + enc(b)`. It does ensure
+        `enc(a + b) = enc(a) + enc(a + b)[len(enc(a)):]`. Reference:
+        https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
+        """
+
+        full_tokenized = self.processing_class(prompt + answer, add_special_tokens=False)
+        prompt_input_ids = self.processing_class(prompt, add_special_tokens=False)["input_ids"]
+
+        answer_input_ids = full_tokenized["input_ids"][len(prompt_input_ids) :]
+        answer_attention_mask = full_tokenized["attention_mask"][len(prompt_input_ids) :]
+
+        # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]`
+        full_concat_input_ids = np.concatenate([prompt_input_ids, answer_input_ids])
+
+        # Prepare input tokens for token by token comparison
+        full_input_ids = np.array(full_tokenized["input_ids"])
+
+        if len(full_input_ids) != len(full_concat_input_ids):
+            raise ValueError("Prompt input ids and answer input ids should have the same length.")
+
+        # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens
+        # can be merged together when tokenizing prompt+answer. This could result
+        # in the last token from the prompt being different when tokenized on its own
+        # vs when done as prompt+answer.
+        response_token_ids_start_idx = len(prompt_input_ids)
+
+        # If the tokenized prompt differs from the prompt portion of prompt+answer,
+        # then the last token has changed due to merging.
+        if prompt_input_ids != full_tokenized["input_ids"][:response_token_ids_start_idx]:
+            response_token_ids_start_idx -= 1
+
+        prompt_input_ids = full_tokenized["input_ids"][:response_token_ids_start_idx]
+        prompt_attention_mask = full_tokenized["attention_mask"][:response_token_ids_start_idx]
+
+        if len(prompt_input_ids) != len(prompt_attention_mask):
+            raise ValueError("Prompt input ids and attention mask should have the same length.")
+
+        answer_input_ids = full_tokenized["input_ids"][response_token_ids_start_idx:]
+        answer_attention_mask = full_tokenized["attention_mask"][response_token_ids_start_idx:]
+
+        return dict(
+            prompt_input_ids=prompt_input_ids,
+            prompt_attention_mask=prompt_attention_mask,
+            input_ids=answer_input_ids,
+            attention_mask=answer_attention_mask,
+        )
+
+    def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module]] = None) -> dict:
+        """Tokenize a single row from a CPO specific dataset.
+
+        At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
+        chosen or prompt + rejected responses is/are too long. First we truncate the prompt; if we're still too long,
+        we truncate the chosen/rejected.
+
+        We also create the labels for the chosen/rejected responses, which are of length equal to the sum of the length
+        of the prompt and the chosen/rejected response, with label_pad_token_id for the prompt tokens.
+        """
+        batch = {}
+        prompt = feature["prompt"]
+        chosen = feature["chosen"]
+        rejected = feature["rejected"]
+
+        if not self.is_encoder_decoder:
+            # Check issues below for more details
+            # 1. https://github.com/huggingface/trl/issues/907
+            # 2. https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
+            # 3. https://github.com/LianjiaTech/BELLE/issues/337
+
+            if not isinstance(prompt, str):
+                raise ValueError(f"prompt should be a str but got {type(prompt)}")
+            prompt_tokens = self.processing_class(prompt, add_special_tokens=False)
+            prompt_tokens = {f"prompt_{k}": v for k, v in prompt_tokens.items()}
+
+            if not isinstance(chosen, str):
+                raise ValueError(f"chosen should be a str but got {type(chosen)}")
+            chosen_tokens = self.build_tokenized_answer(prompt, chosen)
+
+            if not isinstance(rejected, str):
+                raise ValueError(f"rejected should be a str but got {type(rejected)}")
+            rejected_tokens = self.build_tokenized_answer(prompt, rejected)
+
+            # Last prompt token might get merged by tokenizer and
+            # it should not be included for generation if that happens
+            prompt_len_input_ids = len(prompt_tokens["prompt_input_ids"])
+
+            chosen_prompt_len_input_ids = len(chosen_tokens["prompt_input_ids"])
+            rejected_prompt_len_input_ids = len(rejected_tokens["prompt_input_ids"])
+            prompt_len_input_ids = min(chosen_prompt_len_input_ids, rejected_prompt_len_input_ids)
+
+            for k, v in prompt_tokens.items():
+                prompt_tokens[k] = v[:prompt_len_input_ids]
+
+            # Make sure prompts only have one different token at most,
+            # and length only differs by 1 at most
+            num_diff_tokens = sum(
+                [a != b for a, b in zip(chosen_tokens["prompt_input_ids"], rejected_tokens["prompt_input_ids"])]
+            )
+            num_diff_len = abs(chosen_prompt_len_input_ids - rejected_prompt_len_input_ids)
+            if num_diff_tokens > 1 or num_diff_len > 1:
+                raise ValueError(
+                    "Chosen and rejected prompt_input_ids might only differ on the "
+                    "last token due to tokenizer merge ops."
+                )
+
+            # add BOS token to head of prompt. Avoid adding if it's already there
+            prompt_tokens, chosen_tokens, rejected_tokens = add_bos_token_if_needed(
+                self.processing_class.bos_token_id,
+                prompt_len_input_ids,
+                prompt_tokens,
+                chosen_prompt_len_input_ids,
+                chosen_tokens,
+                rejected_prompt_len_input_ids,
+                rejected_tokens,
+            )
+
+            # add EOS token to end of answer.
Avoid adding if it's already there + chosen_tokens, rejected_tokens = add_eos_token_if_needed( + self.processing_class.eos_token_id, chosen_tokens, rejected_tokens + ) + + longer_response_length = max(len(chosen_tokens["input_ids"]), len(rejected_tokens["input_ids"])) + + # if combined sequence is too long, truncate the prompt + for answer_tokens in [chosen_tokens, rejected_tokens, prompt_tokens]: + if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length: + if self.truncation_mode == "keep_start": + for k in ["prompt_input_ids", "prompt_attention_mask"]: + answer_tokens[k] = answer_tokens[k][: self.max_prompt_length] + elif self.truncation_mode == "keep_end": + for k in ["prompt_input_ids", "prompt_attention_mask"]: + answer_tokens[k] = answer_tokens[k][-self.max_prompt_length :] + else: + raise ValueError(f"Unknown truncation mode: {self.truncation_mode}") + + # if that's still too long, truncate the response + for answer_tokens in [chosen_tokens, rejected_tokens]: + if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length: + for k in ["input_ids", "attention_mask"]: + answer_tokens[k] = answer_tokens[k][: self.max_length - self.max_prompt_length] + + # Create labels + chosen_sequence_tokens = { + k: chosen_tokens[f"prompt_{k}"] + chosen_tokens[k] for k in ["input_ids", "attention_mask"] + } + rejected_sequence_tokens = { + k: rejected_tokens[f"prompt_{k}"] + rejected_tokens[k] for k in ["input_ids", "attention_mask"] + } + chosen_sequence_tokens["labels"] = chosen_sequence_tokens["input_ids"][:] + chosen_sequence_tokens["labels"][: len(chosen_tokens["prompt_input_ids"])] = [ + self.label_pad_token_id + ] * len(chosen_tokens["prompt_input_ids"]) + rejected_sequence_tokens["labels"] = rejected_sequence_tokens["input_ids"][:] + rejected_sequence_tokens["labels"][: len(rejected_tokens["prompt_input_ids"])] = [ + self.label_pad_token_id + ] * len(rejected_tokens["prompt_input_ids"]) + + for k, toks in { + "chosen_": chosen_sequence_tokens, + "rejected_": rejected_sequence_tokens, + "": prompt_tokens, + }.items(): + for type_key, tokens in toks.items(): + if type_key == "token_type_ids": + continue + batch[f"{k}{type_key}"] = tokens + + else: + chosen_tokens = self.processing_class( + chosen, truncation=True, max_length=self.max_completion_length, add_special_tokens=True + ) + rejected_tokens = self.processing_class( + rejected, truncation=True, max_length=self.max_completion_length, add_special_tokens=True + ) + prompt_tokens = self.processing_class( + prompt, truncation=True, max_length=self.max_prompt_length, add_special_tokens=True + ) + + batch["chosen_labels"] = chosen_tokens["input_ids"] + batch["rejected_labels"] = rejected_tokens["input_ids"] + batch["prompt_input_ids"] = prompt_tokens["input_ids"] + batch["prompt_attention_mask"] = prompt_tokens["attention_mask"] + + if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"): + batch["rejected_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels( + labels=torch.tensor(batch["rejected_labels"]) + ) + batch["chosen_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels( + labels=torch.tensor(batch["chosen_labels"]) + ) + + return batch + + @staticmethod + def concatenated_inputs( + batch: dict[str, Union[list, torch.LongTensor]], + is_encoder_decoder: bool = False, + label_pad_token_id: int = -100, + padding_value: int = 0, + device: Optional[torch.device] = None, + ) -> dict[str, torch.LongTensor]: + """Concatenate the chosen and 
rejected inputs into a single tensor. + + Args: + batch: + A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors + of shape (batch_size, sequence_length). + is_encoder_decoder: + Whether the model is an encoder-decoder model. + label_pad_token_id: + The label pad token id. + padding_value: + The padding value to use for the concatenated inputs_ids. + device: + The device for the concatenated inputs. + + Returns: + A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'. + """ + concatenated_batch = {} + + if is_encoder_decoder: + max_length = max(batch["chosen_labels"].shape[1], batch["rejected_labels"].shape[1]) + else: + max_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1]) + + for k in batch: + if k.startswith("chosen") and isinstance(batch[k], torch.Tensor): + if "labels" in k or is_encoder_decoder: + pad_value = label_pad_token_id + elif k.endswith("_input_ids"): + pad_value = padding_value + elif k.endswith("_attention_mask"): + pad_value = 0 + concatenated_key = k.replace("chosen", "concatenated") + concatenated_batch[concatenated_key] = pad_to_length(batch[k], max_length, pad_value=pad_value) + for k in batch: + if k.startswith("rejected") and isinstance(batch[k], torch.Tensor): + if "labels" in k or is_encoder_decoder: + pad_value = label_pad_token_id + elif k.endswith("_input_ids"): + pad_value = padding_value + elif k.endswith("_attention_mask"): + pad_value = 0 + concatenated_key = k.replace("rejected", "concatenated") + concatenated_batch[concatenated_key] = torch.cat( + ( + concatenated_batch[concatenated_key], + pad_to_length(batch[k], max_length, pad_value=pad_value), + ), + dim=0, + ).to(device=device) + + if is_encoder_decoder: + concatenated_batch["concatenated_input_ids"] = batch["prompt_input_ids"].repeat(2, 1).to(device=device) + concatenated_batch["concatenated_attention_mask"] = ( + batch["prompt_attention_mask"].repeat(2, 1).to(device=device) + ) + + return concatenated_batch + + def cpo_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """Compute the CPO loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps: + Log probabilities of the policy model for the chosen responses. Shape: (batch_size,) + policy_rejected_logps: + Log probabilities of the policy model for the rejected responses. Shape: (batch_size,) + + Returns: + A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). The losses tensor contains the CPO + loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for + the chosen and rejected responses, respectively. 
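+
+        For reference, with the default `loss_type="sigmoid"` and `alpha=0.0`, the code
+        below computes, per example,
+
+            loss = -logsigmoid(beta * (logp_chosen - logp_rejected)) * (1 - label_smoothing)
+                   - logsigmoid(-beta * (logp_chosen - logp_rejected)) * label_smoothing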
+ """ + # Apply AlphaPO reward transformation if alpha != 0 + if self.alpha != 0.0: + # Compute probabilities + chosen_probs = torch.exp(policy_chosen_logps) + rejected_probs = torch.exp(policy_rejected_logps) + + # Apply AlphaPO transformation: r = (1 - p^(-alpha)) / alpha + policy_chosen_rewards = (1 - chosen_probs.pow(-self.alpha)) / self.alpha + policy_rejected_rewards = (1 - rejected_probs.pow(-self.alpha)) / self.alpha + + logits = (policy_chosen_rewards - policy_rejected_rewards).to(self.accelerator.device) + else: + # Standard log probability rewards when alpha = 0 + logits = (policy_chosen_logps - policy_rejected_logps).to(self.accelerator.device) + + # The beta is a temperature parameter for the CPO loss, typically something in the range of 0.1 to 0.5. + # We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the labels and + # calculates a conservative CPO loss. + + if self.loss_type == "simpo": + gamma_logratios = self.simpo_gamma / self.beta + logits = logits - gamma_logratios + # This reduces to Equation 3 from the CPO paper when label_smoothing -> 0. + losses = ( + -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * logits) * self.label_smoothing + ) + elif self.loss_type == "sigmoid": + # This reduces to Equation 3 from the CPO paper when label_smoothing -> 0. + losses = ( + -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * logits) * self.label_smoothing + ) + elif self.loss_type == "hinge": + losses = torch.relu(1 - self.beta * logits) + elif self.loss_type == "ipo": + # eqn (17) of the paper where beta is the regularization parameter for the IPO loss, denoted by tau in the paper. + losses = (logits - 1 / (2 * self.beta)) ** 2 + else: + raise ValueError( + f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'simpo']" + ) + + # Calculate rewards for logging + if self.alpha != 0.0: + # When using AlphaPO transformation, use the transformed rewards + chosen_rewards = self.beta * policy_chosen_rewards.to(self.accelerator.device).detach() + rejected_rewards = self.beta * policy_rejected_rewards.to(self.accelerator.device).detach() + else: + # Standard log probability rewards + chosen_rewards = self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach() + rejected_rewards = self.beta * (policy_rejected_logps.to(self.accelerator.device)).detach() + + return losses, chosen_rewards, rejected_rewards + + @staticmethod + def get_batch_logps( + logits: torch.FloatTensor, + labels: torch.LongTensor, + average_log_prob: bool = False, + label_pad_token_id: int = -100, + is_encoder_decoder: bool = False, + ) -> torch.FloatTensor: + """Compute the log probabilities of the given labels under the given logits. + + Args: + logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size) + labels: + Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are + ignored. Shape: (batch_size, sequence_length) + average_log_prob: + If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the + log probabilities of the (non-masked) tokens. + label_pad_token_id: The label pad token id. + is_encoder_decoder: Whether the model is an encoder-decoder model. + + Returns: + A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the + given logits. 
+ """ + if logits.shape[:-1] != labels.shape: + raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.") + + if not is_encoder_decoder: + labels = labels[:, 1:].clone() + logits = logits[:, :-1, :] + loss_mask = labels != label_pad_token_id + + # dummy token; we'll ignore the losses on these tokens later + labels[labels == label_pad_token_id] = 0 + + per_token_logps = selective_log_softmax(logits, labels) + + if average_log_prob: + return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) + else: + return (per_token_logps * loss_mask).sum(-1) + + def concatenated_forward( + self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]] + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together. + + We do this to avoid doing two forward passes, because it's faster for FSDP. + """ + concatenated_batch = self.concatenated_inputs( + batch, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + padding_value=self.padding_value, + device=self.accelerator.device, + ) + len_chosen = batch["chosen_labels"].shape[0] + + model_kwargs = ( + { + "decoder_input_ids": self._shift_right(concatenated_batch["concatenated_labels"]), + } + if self.is_encoder_decoder + else {} + ) + + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + outputs = model( + concatenated_batch["concatenated_input_ids"], + attention_mask=concatenated_batch["concatenated_attention_mask"], + use_cache=False, + **model_kwargs, + ) + all_logits = outputs.logits + + def cross_entropy_loss(logits, labels): + if not self.is_encoder_decoder: + # Shift so that tokens < n predict n + logits = logits[..., :-1, :].contiguous() + labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + logits = logits.view(-1, logits.shape[-1]) + labels = labels.view(-1) + # Enable model parallelism + labels = labels.to(logits.device) + loss = loss_fct(logits, labels) + return loss + + labels = concatenated_batch["concatenated_labels"].clone() + + if self.cpo_alpha == 0: + nll_loss = torch.tensor(0.0).to(self.accelerator.device) + else: + nll_loss = cross_entropy_loss(all_logits[:len_chosen], labels[:len_chosen]) + + all_logps = self.get_batch_logps( + all_logits, + concatenated_batch["concatenated_labels"], + average_log_prob=self.loss_type in ["ipo", "simpo"], + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + + chosen_logps = all_logps[:len_chosen] + rejected_logps = all_logps[len_chosen:] + + chosen_logits = all_logits[:len_chosen] + rejected_logits = all_logits[len_chosen:] + + if self.aux_loss_enabled: + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, nll_loss, outputs.aux_loss) + + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, nll_loss) + + def get_batch_loss_metrics( + self, + model, + batch: dict[str, Union[list, torch.LongTensor]], + train_eval: Literal["train", "eval"] = "train", + ): + """Compute the CPO loss and other metrics for the given batch of inputs for train or test.""" + metrics = {} + + forward_output = self.concatenated_forward(model, batch) + ( + policy_chosen_logps, + policy_rejected_logps, + policy_chosen_logits, + policy_rejected_logits, + policy_nll_loss, + ) = forward_output[:5] + if self.aux_loss_enabled: + aux_loss = forward_output[5] + + losses, 
chosen_rewards, rejected_rewards = self.cpo_loss( + policy_chosen_logps, + policy_rejected_logps, + ) + + loss = losses.mean() + self.cpo_alpha * policy_nll_loss + reward_accuracies = (chosen_rewards > rejected_rewards).float() + + prefix = "eval_" if train_eval == "eval" else "" + metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(chosen_rewards).mean().item() + metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(rejected_rewards).mean().item() + metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics(reward_accuracies).mean().item() + metrics[f"{prefix}rewards/margins"] = ( + self.accelerator.gather_for_metrics(chosen_rewards - rejected_rewards).mean().item() + ) + metrics[f"{prefix}logps/rejected"] = ( + self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean().item() + ) + metrics[f"{prefix}logps/chosen"] = ( + self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean().item() + ) + metrics[f"{prefix}logits/rejected"] = ( + self.accelerator.gather_for_metrics(policy_rejected_logits.detach().mean()).mean().item() + ) + metrics[f"{prefix}logits/chosen"] = ( + self.accelerator.gather_for_metrics(policy_chosen_logits.detach().mean()).mean().item() + ) + metrics[f"{prefix}nll_loss"] = self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean().item() + + if self.aux_loss_enabled: + loss += self.aux_loss_coef * aux_loss + + return loss, metrics + + def compute_loss( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + return_outputs=False, + num_items_in_batch=None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]: + compute_loss_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with compute_loss_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train") + + # force log the metrics + self.store_metrics(metrics, train_eval="train") + + if return_outputs: + return (loss, metrics) + return loss + + def generate_from_model(self, model, batch: dict[str, torch.LongTensor]) -> str: + """Generate samples from the model and reference model for the given batch of inputs.""" + + # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with + # the torch amp context manager as some hidden states are silently casted to full precision. 
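+        # autocast(self.accelerator.device.type) wraps generation in a torch.amp mixed-precision
+        # context so the bf16-cast PEFT weights and any silently upcast float32 hidden states can
+        # be used together; nullcontext() leaves generation unchanged when no bf16 cast happened.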
+ generate_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with generate_context_manager: + policy_output = model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + + policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id) + policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True) + + return policy_output_decoded + + def prediction_step( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[list[str]] = None, + ): + if ignore_keys is None: + if hasattr(model, "config"): + ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + prediction_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with torch.no_grad(), prediction_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="eval") + + # force log the metrics + self.store_metrics(metrics, train_eval="eval") + + if prediction_loss_only: + return (loss.detach(), None, None) + + # logits for the chosen and rejected samples from model + logits_dict = { + "eval_logits/chosen": metrics["eval_logits/chosen"], + "eval_logits/rejected": metrics["eval_logits/rejected"], + } + logits = [v for k, v in logits_dict.items() if k not in ignore_keys] + logits = torch.tensor(logits, device=self.accelerator.device) + labels = torch.zeros(logits.shape[0], device=self.accelerator.device) + + return (loss.detach(), logits, labels) + + def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None: + for key, value in metrics.items(): + self._stored_metrics[train_eval][key].append(value) + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "eval", + ) -> EvalLoopOutput: + """ + Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by + `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. 
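+
+        When `generate_during_eval` is enabled, one randomly sampled batch is decoded via
+        `generate_from_model` and logged as a `game_log` table to Weights & Biases and/or Comet
+        before the base evaluation loop runs.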
+ """ + + # Sample and save to game log if requested (for one batch to save time) + if self.generate_during_eval: + # Generate random indices within the range of the total number of samples + num_samples = len(dataloader.dataset) + random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size) + + # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader + random_batch_dataset = dataloader.dataset.select(random_indices) + random_batch = self.data_collator(random_batch_dataset) + random_batch = self._prepare_inputs(random_batch) + + policy_output_decoded = self.generate_from_model(self.model, random_batch) + + table = pd.DataFrame( + columns=["Prompt", "Policy"], + data=[ + [prompt, pol[len(prompt) :]] for prompt, pol in zip(random_batch["prompt"], policy_output_decoded) + ], + ) + if "wandb" in self.args.report_to: + wandb.log({"game_log": wandb.Table(data=table)}) + + if "comet_ml" in self.args.report_to: + log_table_to_comet_experiment( + name="game_log.csv", + table=table, + ) + + # Base evaluation + initial_output = super().evaluation_loop( + dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix + ) + + return initial_output + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + """ + Log `logs` on the various objects watching training, including stored metrics. + + Args: + logs (`dict[str, float]`): + The values to log. + start_time (`float` or `None`, *optional*, defaults to `None`): + Start time of the training. + """ + # logs either has 'loss' or 'eval_loss' + train_eval = "train" if "loss" in logs else "eval" + # Add averaged stored metrics to logs + for key, metrics in self._stored_metrics[train_eval].items(): + logs[key] = torch.tensor(metrics).mean().item() + del self._stored_metrics[train_eval] + return super().log(logs, start_time) + + def _shift_right(self, input_ids): + if self.decoder_start_token_id is None: + raise ValueError( + "model.config.decoder_start_token_id has to be defined. It is usually set to the pad_token_id." + ) + + # shift inputs to the right + if is_torch_fx_proxy(input_ids): + # Item assignment is not supported natively for proxies. + shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), self.decoder_start_token_id) + shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) + else: + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = self.decoder_start_token_id + + if self.pad_token_id is None: + raise ValueError("model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, self.pad_token_id) + + return shifted_input_ids + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. 
+ dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + # docstyle-ignore + citation = textwrap.dedent("""\ + @inproceedings{xu2024contrastive, + title = {{Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation}}, + author = {Haoran Xu and Amr Sharaf and Yunmo Chen and Weiting Tan and Lingfeng Shen and Benjamin Van Durme and Kenton Murray and Young Jin Kim}, + year = 2024, + booktitle = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024}, + publisher = {OpenReview.net}, + url = {https://openreview.net/forum?id=51iwkioZpn} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="CPO", + trainer_citation=citation, + paper_title="Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation", + paper_id="2401.08417", + ) + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothCPOTrainer(_UnslothCPOTrainer): + """ + +Initialize CPOTrainer. + +Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForSequenceClassification`. + args (`CPOConfig`): + The CPO config arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. 
+    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+        The function to use to preprocess the logits before computing the metrics.
+    peft_config (`dict`, defaults to `None`):
+        The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
+        a PEFT model.
+    compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+        The function to use to compute the metrics. Must take an `EvalPrediction` and return a dictionary mapping
+        strings to metric values.
+
+    """
+    def __init__(
+        self,
+        model = None,
+        args = None,
+        data_collator = None,
+        train_dataset = None,
+        eval_dataset = None,
+        processing_class = None,
+        model_init = None,
+        callbacks = None,
+        preprocess_logits_for_metrics = None,
+        peft_config = None,
+        compute_metrics = None,
+        **kwargs
+    ):
+        if args is None: args = UnslothCPOConfig()
+        use_bf16 = getattr(args, 'bf16', False)
+        if type(use_bf16) is not bool: use_bf16 = False
+        use_fp16 = getattr(args, 'fp16', False)
+        if type(use_fp16) is not bool: use_fp16 = False
+        force_float32 = False
+        full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1'
+        if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'):
+            print('Unsloth: Switching to float32 training since model cannot work with float16')
+            force_float32 = True
+        mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32')
+        dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None)
+        if dtype is None: dtype = model.get_input_embeddings().dtype
+        from unsloth_zoo.utils import _get_dtype
+        dtype = _get_dtype(dtype)
+        float16 = dtype == torch.float16
+        if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`')
+        if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if 
isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('cpo_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? [TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + model = model, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + model_init = model_init, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config, + compute_metrics = compute_metrics,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass + + +if hasattr(logger, "addFilter"): + import logging + class HideLoggingMessage(logging.Filter): + def __init__(self, text): self.text = text + def filter(self, x): return not (self.text in x.getMessage()) + pass + logger.addFilter(HideLoggingMessage("`use_cache=True`")) + diff --git a/unsloth_compiled_cache/UnslothDDPOTrainer.py b/unsloth_compiled_cache/UnslothDDPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..b4aae1277c953246c182ac42ee1384ba7291ffab --- /dev/null +++ b/unsloth_compiled_cache/UnslothDDPOTrainer.py @@ -0,0 +1,1081 @@ +""" +2025.11.2 
+2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . + +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.ddpo_trainer import (Accelerator, Any, Callable, DDPOConfig, DDPOStableDiffusionPipeline, DDPOTrainer, Optional, Path, PerPromptStatTracker, ProjectConfiguration, PyTorchModelHubMixin, Union, defaultdict, futures, generate_model_card, get_comet_experiment_url, is_wandb_available, logger, logging, os, set_seed, textwrap, torch, warnings) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) + logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values + all_per_token_logps.append(per_token_logps) + pass + all_per_token_logps = torch.concat(all_per_token_logps) + all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) + return all_per_token_logps + +def calculate_pad_tokens_in_prompt( + input_ids: torch.Tensor, + logits_to_keep: int, 
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a prompt tensor, returns the number of left-padding tokens in each sequence,
+    e.g. [pad, pad, pad, cat] -> 3 tokens.
+    """
+    if logits_to_keep >= input_ids.shape[1]:
+        raise ValueError("logits_to_keep must be smaller than the sequence length.")
+
+    prompt_section = input_ids[:, :-logits_to_keep]
+
+    padding_mask = (prompt_section == pad_token_id)
+
+    pad_token_counts = padding_mask.sum(dim=1)
+
+    return pad_token_counts
+
+def create_completion_attention_mask(
+    completion_input_ids: torch.Tensor,
+    left_pad_tokens_per_prompt: torch.Tensor,
+    max_left_pad: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a sequence [p, p, p, c, c, c, pad, pad, pad], where p are extra prompt tokens
+    obtained from slicing the tensor, c are completion tokens, and pad are pad tokens,
+    builds a completion mask that zeroes out the pad and p tokens: here
+    [0, 0, 0, 1, 1, 1, 0, 0, 0].
+    """
+    batch_size, completion_len = completion_input_ids.shape
+    device = completion_input_ids.device
+
+    num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
+
+    indices = torch.arange(completion_len, device=device).unsqueeze(0)
+    shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
+
+    non_padding_mask = (completion_input_ids != pad_token_id)
+
+    final_mask = shift_mask & non_padding_mask
+
+    return final_mask
+
+def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
+    """
+    Moves all padding tokens in each sequence of a batch to the right.
+    """
+    mask = (tensor != pad_id)
+    # Must use stable=True since the binary mask is unordered
+    sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
+    packed_tensor = torch.gather(tensor, 1, sorted_indices)
+    return packed_tensor
+
+def align_logprobs_with_mask(
+    logprob_tensor: torch.Tensor,
+    attention_mask: torch.Tensor,
+    pad_value: float = 0.0
+) -> torch.Tensor:
+    """
+    Aligns a log probability tensor with a given attention mask by scattering each row of
+    log probabilities so that it starts at the row's first attended (non-padding) position.
+    """
+
+    device = logprob_tensor.device
+    batch_size, logprob_seq_len = logprob_tensor.shape
+    mask_seq_len = attention_mask.shape[1]
+
+    padded_logprobs = torch.full(
+        attention_mask.shape,
+        fill_value=pad_value,
+        dtype=logprob_tensor.dtype,
+        device=device
+    )
+
+    left_pad_counts = torch.argmax(attention_mask, dim=1)
+
+    cols = torch.arange(logprob_seq_len, device=device)
+    dest_indices = left_pad_counts.unsqueeze(1) + cols
+
+    # Create destination row indices
+    # Shape: [batch_size, logprob_seq_len]
+    row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
+
+    # Filter out-of-bounds indices and perform the assignment.
+    # Create a mask to identify only the indices that are within the bounds
+    # of the target tensor's sequence length.
+    valid_mask = dest_indices < mask_seq_len
+
+    # Use this mask to select only the valid row indices, column indices,
+    # and the corresponding values from the logprob tensor.
+    # This flattens the selected elements into 1D tensors.
+    valid_rows = row_indices[valid_mask]
+    valid_cols = dest_indices[valid_mask]
+    valid_vals = logprob_tensor[valid_mask]
+
+    # Place the valid values into their correct positions in the padded tensor
+    # using a single, efficient advanced indexing operation.
+    padded_logprobs[valid_rows, valid_cols] = valid_vals
+
+    return padded_logprobs
+@dataclass
+class UnslothDDPOConfig(DDPOConfig):
+    """
+
+Configuration class for the [`DDPOTrainer`].
+
+Using [`~transformers.HfArgumentParser`] we can turn this class into
+[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+command line.
+
+Parameters:
+    exp_name (`str`, *optional*, defaults to `os.path.basename(sys.argv[0])[: -len(".py")]`):
+        Name of this experiment (by default, the file name without the extension).
+    run_name (`str`, *optional*, defaults to `""`):
+        Name of this run.
+    seed (`int`, *optional*, defaults to `0`):
+        Random seed.
+    log_with (`Literal["wandb", "tensorboard"]` or `None`, *optional*, defaults to `None`):
+        Log with either 'wandb' or 'tensorboard'; see
+        https://huggingface.co/docs/accelerate/usage_guides/tracking for more details.
+    tracker_kwargs (`Dict`, *optional*, defaults to `{}`):
+        Keyword arguments for the tracker (e.g. wandb_project).
+    accelerator_kwargs (`Dict`, *optional*, defaults to `{}`):
+        Keyword arguments for the accelerator.
+    project_kwargs (`Dict`, *optional*, defaults to `{}`):
+        Keyword arguments for the accelerator project config (e.g. `logging_dir`).
+    tracker_project_name (`str`, *optional*, defaults to `"trl"`):
+        Name of project to use for tracking.
+    logdir (`str`, *optional*, defaults to `"logs"`):
+        Top-level logging directory for checkpoint saving.
+    num_epochs (`int`, *optional*, defaults to `100`):
+        Number of epochs to train.
+    save_freq (`int`, *optional*, defaults to `1`):
+        Number of epochs between saving model checkpoints.
+    num_checkpoint_limit (`int`, *optional*, defaults to `5`):
+        Number of checkpoints to keep before overwriting old ones.
+    mixed_precision (`str`, *optional*, defaults to `"fp16"`):
+        Mixed precision training.
+    allow_tf32 (`bool`, *optional*, defaults to `True`):
+        Allow `tf32` on Ampere GPUs.
+    resume_from (`str`, *optional*, defaults to `""`):
+        Resume training from a checkpoint.
+    sample_num_steps (`int`, *optional*, defaults to `50`):
+        Number of sampler inference steps.
+    sample_eta (`float`, *optional*, defaults to `1.0`):
+        Eta parameter for the DDIM sampler.
+    sample_guidance_scale (`float`, *optional*, defaults to `5.0`):
+        Classifier-free guidance weight.
+    sample_batch_size (`int`, *optional*, defaults to `1`):
+        Batch size (per GPU) to use for sampling.
+    sample_num_batches_per_epoch (`int`, *optional*, defaults to `2`):
+        Number of batches to sample per epoch.
+    train_batch_size (`int`, *optional*, defaults to `1`):
+        Batch size (per GPU) to use for training.
+    train_use_8bit_adam (`bool`, *optional*, defaults to `False`):
+        Use 8bit Adam optimizer from bitsandbytes.
+    train_learning_rate (`float`, *optional*, defaults to `3e-4`):
+        Learning rate.
+    train_adam_beta1 (`float`, *optional*, defaults to `0.9`):
+        Adam beta1.
+    train_adam_beta2 (`float`, *optional*, defaults to `0.999`):
+        Adam beta2.
+    train_adam_weight_decay (`float`, *optional*, defaults to `1e-4`):
+        Adam weight decay.
+    train_adam_epsilon (`float`, *optional*, defaults to `1e-8`):
+        Adam epsilon.
+    train_gradient_accumulation_steps (`int`, *optional*, defaults to `1`):
+        Number of gradient accumulation steps.
+    train_max_grad_norm (`float`, *optional*, defaults to `1.0`):
+        Maximum gradient norm for gradient clipping.
+    train_num_inner_epochs (`int`, *optional*, defaults to `1`):
+        Number of inner epochs per outer epoch.
+    train_cfg (`bool`, *optional*, defaults to `True`):
+        Whether to use classifier-free guidance during training.
+    train_adv_clip_max (`float`, *optional*, defaults to `5.0`):
+        Clip advantages to the range `[-train_adv_clip_max, train_adv_clip_max]`.
+    train_clip_range (`float`, *optional*, defaults to `1e-4`):
+        PPO clip range.
+    train_timestep_fraction (`float`, *optional*, defaults to `1.0`):
+        Fraction of timesteps to train on.
+    per_prompt_stat_tracking (`bool`, *optional*, defaults to `False`):
+        Whether to track statistics for each prompt separately.
+    per_prompt_stat_tracking_buffer_size (`int`, *optional*, defaults to `16`):
+        Number of reward values to store in the buffer for each prompt.
+    per_prompt_stat_tracking_min_count (`int`, *optional*, defaults to `16`):
+        Minimum number of reward values to store in the buffer.
+    async_reward_computation (`bool`, *optional*, defaults to `False`):
+        Whether to compute rewards asynchronously.
+    max_workers (`int`, *optional*, defaults to `2`):
+        Maximum number of workers to use for async reward computation.
+    negative_prompts (`str`, *optional*, defaults to `""`):
+        Comma-separated list of prompts to use as negative examples.
+    push_to_hub (`bool`, *optional*, defaults to `False`):
+        Whether to push the final model checkpoint to the Hub.
+
+    """
+    vllm_sampling_params: Optional[Any] = field(
+        default = None,
+        metadata = {'help': 'vLLM SamplingParams'},
+    )
+    unsloth_num_chunks : Optional[int] = field(
+        default = -1,
+        metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'},
+    )
+
+    def __init__(
+        self,
+        exp_name = 'train_grpo',
+        run_name = '',
+        seed = 3407,
+        log_with = None,
+        tracker_project_name = 'trl',
+        logdir = 'logs',
+        num_epochs = 100,
+        save_freq = 1,
+        num_checkpoint_limit = 5,
+        mixed_precision = 'fp16',
+        allow_tf32 = True,
+        resume_from = '',
+        sample_num_steps = 50,
+        sample_eta = 1.0,
+        sample_guidance_scale = 5.0,
+        sample_batch_size = 1,
+        sample_num_batches_per_epoch = 2,
+        train_batch_size = 1,
+        train_use_8bit_adam = False,
+        train_learning_rate = 5e-05,
+        train_adam_beta1 = 0.9,
+        train_adam_beta2 = 0.999,
+        train_adam_weight_decay = 0.01,
+        train_adam_epsilon = 1e-08,
+        train_gradient_accumulation_steps = 2,
+        train_max_grad_norm = 1.0,
+        train_num_inner_epochs = 1,
+        train_cfg = True,
+        train_adv_clip_max = 5.0,
+        train_clip_range = 0.0001,
+        train_timestep_fraction = 1.0,
+        per_prompt_stat_tracking = False,
+        per_prompt_stat_tracking_buffer_size = 16,
+        per_prompt_stat_tracking_min_count = 16,
+        async_reward_computation = False,
+        max_workers = 2,
+        negative_prompts = '',
+        push_to_hub = False,
+        vllm_sampling_params = None,
+        unsloth_num_chunks = -1,
+
+        **kwargs,
+    ):
+        if train_learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{train_learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!')
+        if train_learning_rate > 1: print(f'Unsloth: Your learning rate of `{train_learning_rate}` is way too large (> 1)! 
Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + + super().__init__( + exp_name = exp_name, + run_name = run_name, + seed = seed, + log_with = log_with, + tracker_project_name = tracker_project_name, + logdir = logdir, + num_epochs = num_epochs, + save_freq = save_freq, + num_checkpoint_limit = num_checkpoint_limit, + mixed_precision = mixed_precision, + allow_tf32 = allow_tf32, + resume_from = resume_from, + sample_num_steps = sample_num_steps, + sample_eta = sample_eta, + sample_guidance_scale = sample_guidance_scale, + sample_batch_size = sample_batch_size, + sample_num_batches_per_epoch = sample_num_batches_per_epoch, + train_batch_size = train_batch_size, + train_use_8bit_adam = train_use_8bit_adam, + train_learning_rate = train_learning_rate, + train_adam_beta1 = train_adam_beta1, + train_adam_beta2 = train_adam_beta2, + train_adam_weight_decay = train_adam_weight_decay, + train_adam_epsilon = train_adam_epsilon, + train_gradient_accumulation_steps = train_gradient_accumulation_steps, + train_max_grad_norm = train_max_grad_norm, + train_num_inner_epochs = train_num_inner_epochs, + train_cfg = train_cfg, + train_adv_clip_max = train_adv_clip_max, + train_clip_range = train_clip_range, + train_timestep_fraction = train_timestep_fraction, + per_prompt_stat_tracking = per_prompt_stat_tracking, + per_prompt_stat_tracking_buffer_size = per_prompt_stat_tracking_buffer_size, + per_prompt_stat_tracking_min_count = per_prompt_stat_tracking_min_count, + async_reward_computation = async_reward_computation, + max_workers = max_workers, + negative_prompts = negative_prompts, + push_to_hub = push_to_hub,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks + +pass + +class _UnslothDDPOTrainer(PyTorchModelHubMixin): + """ + The DDPOTrainer uses Deep Diffusion Policy Optimization to optimise diffusion models. Note, this trainer is heavily + inspired by the work here: https://github.com/kvablack/ddpo-pytorch As of now only Stable Diffusion based pipelines + are supported + + Args: + config ([`DDPOConfig`]): + Configuration object for DDPOTrainer. Check the documentation of [`PPOConfig`] for more details. + reward_function (`Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor]`): + Reward function to be used. + prompt_function (`Callable[[], tuple[str, Any]]`): Function to generate prompts to guide model + sd_pipeline ([`DDPOStableDiffusionPipeline`]): Stable Diffusion pipeline to be used for training. + image_samples_hook (`Optional[Callable[[Any, Any, Any], Any]]`): Hook to be called to log images. 
+ """ + + _tag_names = ["trl", "ddpo"] + + def __init__( + self, + config: DDPOConfig, + reward_function: Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor], + prompt_function: Callable[[], tuple[str, Any]], + sd_pipeline: DDPOStableDiffusionPipeline, + image_samples_hook: Optional[Callable[[Any, Any, Any], Any]] = None, + ): + warnings.warn( + "DDPOTrainer is deprecated and will be removed in version 0.23.0.", + DeprecationWarning, + ) + if image_samples_hook is None: + logger.warning("No image_samples_hook provided; no images will be logged") + + self.prompt_fn = prompt_function + self.reward_fn = reward_function + self.config = config + self.image_samples_callback = image_samples_hook + + accelerator_project_config = ProjectConfiguration(**self.config.project_kwargs) + + if self.config.resume_from: + self.config.resume_from = os.path.normpath(os.path.expanduser(self.config.resume_from)) + if "checkpoint_" not in os.path.basename(self.config.resume_from): + # get the most recent checkpoint in this directory + checkpoints = list( + filter( + lambda x: "checkpoint_" in x, + os.listdir(self.config.resume_from), + ) + ) + if len(checkpoints) == 0: + raise ValueError(f"No checkpoints found in {self.config.resume_from}") + checkpoint_numbers = sorted([int(x.split("_")[-1]) for x in checkpoints]) + self.config.resume_from = os.path.join( + self.config.resume_from, + f"checkpoint_{checkpoint_numbers[-1]}", + ) + + accelerator_project_config.iteration = checkpoint_numbers[-1] + 1 + + # number of timesteps within each trajectory to train on + self.num_train_timesteps = int(self.config.sample_num_steps * self.config.train_timestep_fraction) + + self.accelerator = Accelerator( + log_with=self.config.log_with, + mixed_precision=self.config.mixed_precision, + project_config=accelerator_project_config, + # we always accumulate gradients across timesteps; we want config.train.gradient_accumulation_steps to be the + # number of *samples* we accumulate across, so we need to multiply by the number of training timesteps to get + # the total number of optimizer steps to accumulate across. + gradient_accumulation_steps=self.config.train_gradient_accumulation_steps * self.num_train_timesteps, + **self.config.accelerator_kwargs, + ) + + is_okay, message = self._config_check() + if not is_okay: + raise ValueError(message) + + is_using_tensorboard = config.log_with is not None and config.log_with == "tensorboard" + + if self.accelerator.is_main_process: + self.accelerator.init_trackers( + self.config.tracker_project_name, + config=dict(ddpo_trainer_config=config.to_dict()) if not is_using_tensorboard else config.to_dict(), + init_kwargs=self.config.tracker_kwargs, + ) + + logger.info(f"\n{config}") + + set_seed(self.config.seed, device_specific=True) + + self.sd_pipeline = sd_pipeline + + self.sd_pipeline.set_progress_bar_config( + position=1, + disable=not self.accelerator.is_local_main_process, + leave=False, + desc="Timestep", + dynamic_ncols=True, + ) + + # For mixed precision training we cast all non-trainable weights [vae, non-lora text_encoder and non-lora unet] to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. 
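+        # The accelerator's mixed_precision string is mapped to the torch dtype used for these
+        # inference-only weights below: "fp16" -> torch.float16, "bf16" -> torch.bfloat16,
+        # anything else -> torch.float32.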
+ if self.accelerator.mixed_precision == "fp16": + inference_dtype = torch.float16 + elif self.accelerator.mixed_precision == "bf16": + inference_dtype = torch.bfloat16 + else: + inference_dtype = torch.float32 + + self.sd_pipeline.vae.to(self.accelerator.device, dtype=inference_dtype) + self.sd_pipeline.text_encoder.to(self.accelerator.device, dtype=inference_dtype) + self.sd_pipeline.unet.to(self.accelerator.device, dtype=inference_dtype) + + trainable_layers = self.sd_pipeline.get_trainable_layers() + + self.accelerator.register_save_state_pre_hook(self._save_model_hook) + self.accelerator.register_load_state_pre_hook(self._load_model_hook) + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if self.config.allow_tf32 and torch.cuda.is_available(): + torch.backends.cuda.matmul.allow_tf32 = True + + self.optimizer = self._setup_optimizer( + trainable_layers.parameters() if not isinstance(trainable_layers, list) else trainable_layers + ) + + self.neg_prompt_embed = self.sd_pipeline.text_encoder( + self.sd_pipeline.tokenizer( + [""] if self.config.negative_prompts is None else self.config.negative_prompts, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.sd_pipeline.tokenizer.model_max_length, + ).input_ids.to(self.accelerator.device) + )[0] + + if config.per_prompt_stat_tracking: + self.stat_tracker = PerPromptStatTracker( + config.per_prompt_stat_tracking_buffer_size, + config.per_prompt_stat_tracking_min_count, + ) + + # NOTE: for some reason, autocast is necessary for non-lora training but for lora training it isn't necessary and it uses + # more memory + self.autocast = self.sd_pipeline.autocast or self.accelerator.autocast + + if hasattr(self.sd_pipeline, "use_lora") and self.sd_pipeline.use_lora: + unet, self.optimizer = self.accelerator.prepare(trainable_layers, self.optimizer) + self.trainable_layers = list(filter(lambda p: p.requires_grad, unet.parameters())) + else: + self.trainable_layers, self.optimizer = self.accelerator.prepare(trainable_layers, self.optimizer) + + if self.config.async_reward_computation: + self.executor = futures.ThreadPoolExecutor(max_workers=config.max_workers) + + if config.resume_from: + logger.info(f"Resuming from {config.resume_from}") + self.accelerator.load_state(config.resume_from) + self.first_epoch = int(config.resume_from.split("_")[-1]) + 1 + else: + self.first_epoch = 0 + + def compute_rewards(self, prompt_image_pairs, is_async=False): + if not is_async: + rewards = [] + for images, prompts, prompt_metadata in prompt_image_pairs: + reward, reward_metadata = self.reward_fn(images, prompts, prompt_metadata) + rewards.append( + ( + torch.as_tensor(reward, device=self.accelerator.device), + reward_metadata, + ) + ) + else: + rewards = self.executor.map(lambda x: self.reward_fn(*x), prompt_image_pairs) + rewards = [ + (torch.as_tensor(reward.result(), device=self.accelerator.device), reward_metadata.result()) + for reward, reward_metadata in rewards + ] + + return zip(*rewards) + + def step(self, epoch: int, global_step: int): + """ + Perform a single step of training. + + Args: + epoch (int): The current epoch. + global_step (int): The current global step. + + Side Effects: + - Model weights are updated + - Logs the statistics to the accelerator trackers. + - If `self.image_samples_callback` is not None, it will be called with the prompt_image_pairs, global_step, + and the accelerator tracker. 
+ + Returns: + global_step (int): The updated global step. + + """ + samples, prompt_image_data = self._generate_samples( + iterations=self.config.sample_num_batches_per_epoch, + batch_size=self.config.sample_batch_size, + ) + + # collate samples into dict where each entry has shape (num_batches_per_epoch * sample.batch_size, ...) + samples = {k: torch.cat([s[k] for s in samples]) for k in samples[0].keys()} + rewards, rewards_metadata = self.compute_rewards( + prompt_image_data, is_async=self.config.async_reward_computation + ) + + for i, image_data in enumerate(prompt_image_data): + image_data.extend([rewards[i], rewards_metadata[i]]) + + if self.image_samples_callback is not None: + self.image_samples_callback(prompt_image_data, global_step, self.accelerator.trackers[0]) + + rewards = torch.cat(rewards) + rewards = self.accelerator.gather(rewards).cpu().numpy() + + self.accelerator.log( + { + "reward": rewards, + "epoch": epoch, + "reward_mean": rewards.mean(), + "reward_std": rewards.std(), + }, + step=global_step, + ) + + if self.config.per_prompt_stat_tracking: + # gather the prompts across processes + prompt_ids = self.accelerator.gather(samples["prompt_ids"]).cpu().numpy() + prompts = self.sd_pipeline.tokenizer.batch_decode(prompt_ids, skip_special_tokens=True) + advantages = self.stat_tracker.update(prompts, rewards) + else: + advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-8) + + # ungather advantages; keep the entries corresponding to the samples on this process + samples["advantages"] = ( + torch.as_tensor(advantages) + .reshape(self.accelerator.num_processes, -1)[self.accelerator.process_index] + .to(self.accelerator.device) + ) + + del samples["prompt_ids"] + + total_batch_size, num_timesteps = samples["timesteps"].shape + + for inner_epoch in range(self.config.train_num_inner_epochs): + # shuffle samples along batch dimension + perm = torch.randperm(total_batch_size, device=self.accelerator.device) + samples = {k: v[perm] for k, v in samples.items()} + + # shuffle along time dimension independently for each sample + # still trying to understand the code below + perms = torch.stack( + [torch.randperm(num_timesteps, device=self.accelerator.device) for _ in range(total_batch_size)] + ) + + for key in ["timesteps", "latents", "next_latents", "log_probs"]: + samples[key] = samples[key][ + torch.arange(total_batch_size, device=self.accelerator.device)[:, None], + perms, + ] + + original_keys = samples.keys() + original_values = samples.values() + # rebatch them as user defined train_batch_size is different from sample_batch_size + reshaped_values = [v.reshape(-1, self.config.train_batch_size, *v.shape[1:]) for v in original_values] + + # Transpose the list of original values + transposed_values = zip(*reshaped_values) + # Create new dictionaries for each row of transposed values + samples_batched = [dict(zip(original_keys, row_values)) for row_values in transposed_values] + + self.sd_pipeline.unet.train() + global_step = self._train_batched_samples(inner_epoch, epoch, global_step, samples_batched) + # ensure optimization step at the end of the inner epoch + if not self.accelerator.sync_gradients: + raise ValueError( + "Optimization step should have been performed by this point. Please check calculated gradient accumulation settings." 
+ ) + + if epoch != 0 and epoch % self.config.save_freq == 0 and self.accelerator.is_main_process: + self.accelerator.save_state() + + return global_step + + def calculate_loss(self, latents, timesteps, next_latents, log_probs, advantages, embeds): + """ + Calculate the loss for a batch of an unpacked sample + + Args: + latents (torch.Tensor): + The latents sampled from the diffusion model, shape: [batch_size, num_channels_latents, height, width] + timesteps (torch.Tensor): + The timesteps sampled from the diffusion model, shape: [batch_size] + next_latents (torch.Tensor): + The next latents sampled from the diffusion model, shape: [batch_size, num_channels_latents, height, + width] + log_probs (torch.Tensor): + The log probabilities of the latents, shape: [batch_size] + advantages (torch.Tensor): + The advantages of the latents, shape: [batch_size] + embeds (torch.Tensor): + The embeddings of the prompts, shape: [2*batch_size or batch_size, ...] Note: the "or" is because if + train_cfg is True, the expectation is that negative prompts are concatenated to the embeds + + Returns: + loss (torch.Tensor), approx_kl (torch.Tensor), clipfrac (torch.Tensor) (all of these are of shape (1,)) + """ + with self.autocast(): + if self.config.train_cfg: + noise_pred = self.sd_pipeline.unet( + torch.cat([latents] * 2), + torch.cat([timesteps] * 2), + embeds, + ).sample + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.config.sample_guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) + else: + noise_pred = self.sd_pipeline.unet( + latents, + timesteps, + embeds, + ).sample + # compute the log prob of next_latents given latents under the current model + + scheduler_step_output = self.sd_pipeline.scheduler_step( + noise_pred, + timesteps, + latents, + eta=self.config.sample_eta, + prev_sample=next_latents, + ) + + log_prob = scheduler_step_output.log_probs + + advantages = torch.clamp( + advantages, + -self.config.train_adv_clip_max, + self.config.train_adv_clip_max, + ) + + ratio = torch.exp(log_prob - log_probs) + + loss = self.loss(advantages, self.config.train_clip_range, ratio) + + approx_kl = 0.5 * torch.mean((log_prob - log_probs) ** 2) + + clipfrac = torch.mean((torch.abs(ratio - 1.0) > self.config.train_clip_range).float()) + + return loss, approx_kl, clipfrac + + def loss( + self, + advantages: torch.Tensor, + clip_range: float, + ratio: torch.Tensor, + ): + unclipped_loss = -advantages * ratio + clipped_loss = -advantages * torch.clamp( + ratio, + 1.0 - clip_range, + 1.0 + clip_range, + ) + return torch.mean(torch.maximum(unclipped_loss, clipped_loss)) + + def _setup_optimizer(self, trainable_layers_parameters): + if self.config.train_use_8bit_adam: + import bitsandbytes + + optimizer_cls = bitsandbytes.optim.AdamW8bit + else: + optimizer_cls = torch.optim.AdamW + + return optimizer_cls( + trainable_layers_parameters, + lr=self.config.train_learning_rate, + betas=(self.config.train_adam_beta1, self.config.train_adam_beta2), + weight_decay=self.config.train_adam_weight_decay, + eps=self.config.train_adam_epsilon, + ) + + def _save_model_hook(self, models, weights, output_dir): + self.sd_pipeline.save_checkpoint(models, weights, output_dir) + weights.pop() # ensures that accelerate doesn't try to handle saving of the model + + def _load_model_hook(self, models, input_dir): + self.sd_pipeline.load_checkpoint(models, input_dir) + models.pop() # ensures that accelerate doesn't try to handle loading of the model + + def _generate_samples(self, 
iterations, batch_size): + """ + Generate samples from the model + + Args: + iterations (int): Number of iterations to generate samples for + batch_size (int): Batch size to use for sampling + + Returns: + samples (list[dict[str, torch.Tensor]]), prompt_image_pairs (list[list[Any]]) + """ + samples = [] + prompt_image_pairs = [] + self.sd_pipeline.unet.eval() + + sample_neg_prompt_embeds = self.neg_prompt_embed.repeat(batch_size, 1, 1) + + for _ in range(iterations): + prompts, prompt_metadata = zip(*[self.prompt_fn() for _ in range(batch_size)]) + + prompt_ids = self.sd_pipeline.tokenizer( + prompts, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.sd_pipeline.tokenizer.model_max_length, + ).input_ids.to(self.accelerator.device) + prompt_embeds = self.sd_pipeline.text_encoder(prompt_ids)[0] + + with self.autocast(): + sd_output = self.sd_pipeline( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=sample_neg_prompt_embeds, + num_inference_steps=self.config.sample_num_steps, + guidance_scale=self.config.sample_guidance_scale, + eta=self.config.sample_eta, + output_type="pt", + ) + + images = sd_output.images + latents = sd_output.latents + log_probs = sd_output.log_probs + + latents = torch.stack(latents, dim=1) # (batch_size, num_steps + 1, ...) + log_probs = torch.stack(log_probs, dim=1) # (batch_size, num_steps, 1) + timesteps = self.sd_pipeline.scheduler.timesteps.repeat(batch_size, 1) # (batch_size, num_steps) + + samples.append( + { + "prompt_ids": prompt_ids, + "prompt_embeds": prompt_embeds, + "timesteps": timesteps, + "latents": latents[:, :-1], # each entry is the latent before timestep t + "next_latents": latents[:, 1:], # each entry is the latent after timestep t + "log_probs": log_probs, + "negative_prompt_embeds": sample_neg_prompt_embeds, + } + ) + prompt_image_pairs.append([images, prompts, prompt_metadata]) + + return samples, prompt_image_pairs + + def _train_batched_samples(self, inner_epoch, epoch, global_step, batched_samples): + """ + Train on a batch of samples. Main training segment + + Args: + inner_epoch (int): The current inner epoch + epoch (int): The current epoch + global_step (int): The current global step + batched_samples (list[dict[str, torch.Tensor]]): The batched samples to train on + + Side Effects: + - Model weights are updated + - Logs the statistics to the accelerator trackers. 
+ + Returns: + global_step (int): The updated global step + """ + info = defaultdict(list) + for _i, sample in enumerate(batched_samples): + if self.config.train_cfg: + # concat negative prompts to sample prompts to avoid two forward passes + embeds = torch.cat([sample["negative_prompt_embeds"], sample["prompt_embeds"]]) + else: + embeds = sample["prompt_embeds"] + + for j in range(self.num_train_timesteps): + with self.accelerator.accumulate(self.sd_pipeline.unet): + loss, approx_kl, clipfrac = self.calculate_loss( + sample["latents"][:, j], + sample["timesteps"][:, j], + sample["next_latents"][:, j], + sample["log_probs"][:, j], + sample["advantages"], + embeds, + ) + info["approx_kl"].append(approx_kl) + info["clipfrac"].append(clipfrac) + info["loss"].append(loss) + + self.accelerator.backward(loss) + if self.accelerator.sync_gradients: + self.accelerator.clip_grad_norm_( + self.trainable_layers.parameters() + if not isinstance(self.trainable_layers, list) + else self.trainable_layers, + self.config.train_max_grad_norm, + ) + self.optimizer.step() + self.optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if self.accelerator.sync_gradients: + # log training-related stuff + info = {k: torch.mean(torch.stack(v)) for k, v in info.items()} + info = self.accelerator.reduce(info, reduction="mean") + info.update({"epoch": epoch, "inner_epoch": inner_epoch}) + self.accelerator.log(info, step=global_step) + global_step += 1 + info = defaultdict(list) + return global_step + + def _config_check(self) -> tuple[bool, str]: + samples_per_epoch = ( + self.config.sample_batch_size * self.accelerator.num_processes * self.config.sample_num_batches_per_epoch + ) + total_train_batch_size = ( + self.config.train_batch_size + * self.accelerator.num_processes + * self.config.train_gradient_accumulation_steps + ) + + if not self.config.sample_batch_size >= self.config.train_batch_size: + return ( + False, + f"Sample batch size ({self.config.sample_batch_size}) must be greater than or equal to the train batch size ({self.config.train_batch_size})", + ) + if not self.config.sample_batch_size % self.config.train_batch_size == 0: + return ( + False, + f"Sample batch size ({self.config.sample_batch_size}) must be divisible by the train batch size ({self.config.train_batch_size})", + ) + if not samples_per_epoch % total_train_batch_size == 0: + return ( + False, + f"Number of samples per epoch ({samples_per_epoch}) must be divisible by the total train batch size ({total_train_batch_size})", + ) + return True, "" + + def train(self, epochs: Optional[int] = None): + """ + Train the model for a given number of epochs + """ + global_step = 0 + if epochs is None: + epochs = self.config.num_epochs + for epoch in range(self.first_epoch, epochs): + global_step = self.step(epoch, global_step) + + def _save_pretrained(self, save_directory): + self.sd_pipeline.save_pretrained(save_directory) + self.create_model_card() + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information 
available to the `Trainer`.
+
+        Args:
+            model_name (`str` or `None`, *optional*, defaults to `None`):
+                Name of the model.
+            dataset_name (`str` or `None`, *optional*, defaults to `None`):
+                Name of the dataset used for training.
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
+                Tags to be associated with the model card.
+        """
+        if not self.is_world_process_zero():
+            return
+
+        if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path):
+            base_model = self.model.config._name_or_path
+        else:
+            base_model = None
+
+        # normalize `tags` to a mutable set
+        if tags is None:
+            tags = set()
+        elif isinstance(tags, str):
+            tags = {tags}
+        else:
+            tags = set(tags)
+
+        if hasattr(self.model.config, "unsloth_version"):
+            tags.add("unsloth")
+
+        if "JOB_ID" in os.environ:
+            tags.add("hf_jobs")
+
+        tags.update(self._tag_names)
+
+        # docstyle-ignore
+        citation = textwrap.dedent("""\
+        @inproceedings{black2024training,
+            title = {{Training Diffusion Models with Reinforcement Learning}},
+            author = {Kevin Black and Michael Janner and Yilun Du and Ilya Kostrikov and Sergey Levine},
+            year = 2024,
+            booktitle = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
+            publisher = {OpenReview.net},
+            url = {https://openreview.net/forum?id=YCWjhGrJFD},
+        }""")
+
+        model_card = generate_model_card(
+            base_model=base_model,
+            model_name=model_name,
+            hub_model_id=self.hub_model_id,
+            dataset_name=dataset_name,
+            tags=tags,
+            wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None,
+            comet_url=get_comet_experiment_url(),
+            trainer_name="DDPO",
+            trainer_citation=citation,
+            paper_title="Training Diffusion Models with Reinforcement Learning",
+            paper_id="2305.13301",
+        )
+
+        model_card.save(os.path.join(self.args.output_dir, "README.md"))
+class UnslothDDPOTrainer(_UnslothDDPOTrainer):
+    """
+
+The DDPOTrainer uses Deep Diffusion Policy Optimization to optimise diffusion models. Note, this trainer is heavily
+inspired by the work here: https://github.com/kvablack/ddpo-pytorch As of now only Stable Diffusion based pipelines
+are supported
+
+Args:
+    config ([`DDPOConfig`]):
+        Configuration object for DDPOTrainer. Check the documentation of [`PPOConfig`] for more details.
+    reward_function (`Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor]`):
+        Reward function to be used.
+    prompt_function (`Callable[[], tuple[str, Any]]`): Function to generate prompts to guide model
+    sd_pipeline ([`DDPOStableDiffusionPipeline`]): Stable Diffusion pipeline to be used for training.
+    image_samples_hook (`Optional[Callable[[Any, Any, Any], Any]]`): Hook to be called to log images.
+
+    """
+    def __init__(
+        self,
+        config,
+        reward_function,
+        prompt_function,
+        sd_pipeline,
+        image_samples_hook = None,
+        **kwargs
+    ):
+        if config is None: config = UnslothDDPOConfig()
+        other_metrics = []
+
+        from unsloth_zoo.logging_utils import PatchRLStatistics
+        PatchRLStatistics('ddpo_trainer', other_metrics)
+
+        # [TODO] Fix up DataParallel multiplying batch sizes
+        # [TODO] DDP works, but DP seems to not work? 
[TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + config = config, + reward_function = reward_function, + prompt_function = prompt_function, + sd_pipeline = sd_pipeline, + image_samples_hook = image_samples_hook,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + +pass + + +if hasattr(logger, "addFilter"): + import logging + class HideLoggingMessage(logging.Filter): + def __init__(self, text): self.text = text + def filter(self, x): return not (self.text in x.getMessage()) + pass + logger.addFilter(HideLoggingMessage("`use_cache=True`")) + diff --git a/unsloth_compiled_cache/UnslothDPOTrainer.py b/unsloth_compiled_cache/UnslothDPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..330d0c2097e5e078dfec9ae8b5665697f9f73be4 --- /dev/null +++ b/unsloth_compiled_cache/UnslothDPOTrainer.py @@ -0,0 +1,2796 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . 
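+
+# A quick note on `chunked_selective_log_softmax` defined below: for each selected token id y it
+# computes log p(y) = logits[y] - logsumexp(logits) in float32, after splitting the flattened
+# (batch * seq_len, vocab) logits into 4 chunks. The logsumexp temporaries are the main memory
+# cost of DPO-style log-prob computation, so chunking bounds the peak usage.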
+ +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.dpo_trainer import (Any, AutoModelForCausalLM, AutoTokenizer, BaseImageProcessor, Callable, DPOConfig, DPOTrainer, DataCollator, DataCollatorForPreference, DataLoader, Dataset, EvalLoopOutput, F, FDivergenceConstants, FDivergenceType, FeatureExtractionMixin, IterableDataset, Literal, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, Optional, PartialState, Path, PeftConfig, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RunningMoments, SyncRefModelCallback, Trainer, TrainerCallback, Union, autocast, cap_exp, contextmanager, create_reference_model, dataclass, defaultdict, disable_dropout_in_model, empty_cache, flush_left, flush_right, generate_model_card, get_comet_experiment_url, get_peft_model, inspect, is_comet_available, is_liger_kernel_available, is_mlflow_available, is_peft_available, is_wandb_available, log_table_to_comet_experiment, logger, logging, maybe_apply_chat_template, maybe_extract_prompt, nn, nullcontext, os, pad, pad_to_length, pd, peft_module_casting_to_bf16, prepare_deepspeed, prepare_fsdp, prepare_model_for_kbit_training, random, selective_log_softmax, shift_tokens_right, textwrap, torch, tqdm, F, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, logger, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) + logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values + all_per_token_logps.append(per_token_logps) + pass + all_per_token_logps = torch.concat(all_per_token_logps) + all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) + return all_per_token_logps + +def 
calculate_pad_tokens_in_prompt(
+    input_ids: torch.Tensor,
+    logits_to_keep: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a left-padded prompt tensor, return the number of padding tokens in each sequence,
+    e.g. [pad, pad, pad, cat] -> 3.
+    """
+    if logits_to_keep >= input_ids.shape[1]:
+        raise ValueError("logits_to_keep must be smaller than the sequence length.")
+
+    prompt_section = input_ids[:, :-logits_to_keep]
+
+    padding_mask = (prompt_section == pad_token_id)
+
+    pad_token_counts = padding_mask.sum(dim=1)
+
+    return pad_token_counts
+
+def create_completion_attention_mask(
+    completion_input_ids: torch.Tensor,
+    left_pad_tokens_per_prompt: torch.Tensor,
+    max_left_pad: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a sequence like [p, p, p, c, c, c, pad, pad, pad], where p are extra prompt tokens
+    obtained from slicing the tensor, c are completion tokens, and pad are padding tokens,
+    build a completion mask that zeroes out the p and pad tokens: here [0, 0, 0, 1, 1, 1, 0, 0, 0].
+    """
+    batch_size, completion_len = completion_input_ids.shape
+    device = completion_input_ids.device
+
+    num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
+
+    indices = torch.arange(completion_len, device=device).unsqueeze(0)
+    shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
+
+    non_padding_mask = (completion_input_ids != pad_token_id)
+
+    final_mask = shift_mask & non_padding_mask
+
+    return final_mask
+
+def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
+    """
+    Moves all padding tokens in each sequence of a batch to the right.
+    """
+    mask = (tensor != pad_id)
+    # Must sort with stable=True: the key is a binary mask, so stability is what preserves
+    # the original token order within the padding and non-padding groups.
+    sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
+    packed_tensor = torch.gather(tensor, 1, sorted_indices)
+    return packed_tensor
+
+def align_logprobs_with_mask(
+    logprob_tensor: torch.Tensor,
+    attention_mask: torch.Tensor,
+    pad_value: float = 0.0
+) -> torch.Tensor:
+    """
+    Aligns a log probability tensor with a given (left-padded) attention mask.
+    """
+
+    device = logprob_tensor.device
+    batch_size, logprob_seq_len = logprob_tensor.shape
+    mask_seq_len = attention_mask.shape[1]
+
+    padded_logprobs = torch.full(
+        attention_mask.shape,
+        fill_value=pad_value,
+        dtype=logprob_tensor.dtype,
+        device=device
+    )
+
+    # The first non-zero position in each row of the mask equals the number of left-padding tokens.
+    left_pad_counts = torch.argmax(attention_mask, dim=1)
+
+    cols = torch.arange(logprob_seq_len, device=device)
+    dest_indices = left_pad_counts.unsqueeze(1) + cols
+
+    # Create destination row indices
+    # Shape: [batch_size, logprob_seq_len]
+    row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
+
+    # Filter out-of-bounds indices and perform the assignment: keep only the indices
+    # that fall within the bounds of the target tensor's sequence length.
+    valid_mask = dest_indices < mask_seq_len
+
+    # Use this mask to select only the valid row indices, column indices,
+    # and the corresponding values from the logprob tensor.
+    # This flattens the selected elements into 1D tensors.
+    valid_rows = row_indices[valid_mask]
+    valid_cols = dest_indices[valid_mask]
+    valid_vals = logprob_tensor[valid_mask]
+
+    # Place the valid values into their correct positions in the padded tensor
+    # using a single advanced indexing operation.
+    padded_logprobs[valid_rows, valid_cols] = valid_vals
+
+    return padded_logprobs
+
+@dataclass
+class UnslothDPOConfig(DPOConfig):
+    """
+
+Configuration class for the [`DPOTrainer`].
+ +This class includes only the parameters that are specific to DPO training. For a full list of training arguments, +please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may +differ from those in [`~transformers.TrainingArguments`]. + +Using [`~transformers.HfArgumentParser`] we can turn this class into +[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the +command line. + +Parameters: + > Parameters that control the model and reference model + + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments for `AutoModelForCausalLM.from_pretrained`, used when the `model` argument of the + [`DPOTrainer`] is provided as a string. + ref_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments for `AutoModelForCausalLM.from_pretrained`, used when the `ref_model` argument of the + [`DPOTrainer`] is provided as a string. + model_adapter_name (`str` or `None`, *optional*, defaults to `None`): + Name of the train target PEFT adapter, when using LoRA with multiple adapters. + ref_adapter_name (`str` or `None`, *optional*, defaults to `None`): + Name of the reference PEFT adapter, when using LoRA with multiple adapters. + force_use_ref_model (`bool`, *optional*, defaults to `False`): + If you provide a PEFT model as the active model and wish to use a different model for the `ref_model`, set + this flag to `True`. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model and reference model. + use_logits_to_keep (`bool`, *optional*, defaults to `False`): + If `True`, only a specified number of logits are computed in the forward pass. This can be useful for + saving memory and speeding up training by not computing the logits for all tokens, especially in scenarios + when working with very long prompts where labels are ignored (-100). + + > Parameters that control the data preprocessing + + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + padding_value (`int` or `None`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + label_pad_token_id (`int`, *optional*, defaults to `-100`): + Padding value to use for labels. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. + max_completion_length (`int` or `None`, *optional*, defaults to `None`): + Maximum length of the completion. + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the full sequence (prompt + completion). + truncation_mode (`str`, *optional*, defaults to `"keep_end"`): + Truncation mode to use when the sequence exceeds `max_length`. Possible values are `"keep_end"` and + `"keep_start"`. + padding_free (`bool`, *optional*, defaults to `False`): + Whether to perform forward passes without padding by flattening all sequences in the batch into a single + continuous sequence. This reduces memory usage by eliminating padding overhead. Currently, this is only + supported with the `flash_attention_2` attention implementation, which can efficiently handle the flattened + batch structure. + precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): + Whether to precompute the log probabilities from the reference model. 
Setting this to `True` allows + training without needing the reference model during training, which can help reduce GPU memory usage. If + set to `False` (default), the reference model will be used during training to compute log probabilities + on-the-fly. + precompute_ref_batch_size (`int` or `None`, *optional*, defaults to `None`): + Batch size to use when precomputing reference model log probabilities. This can be set higher than the + training batch size to speed up preprocessing. If `None`, defaults to `per_device_train_batch_size` for + training and `per_device_eval_batch_size` for evaluation. + tools (`Optional[list[Union[dict, Callable]]]`, *optional*, defaults to `None`): + List of tools (callable functions) that will be accessible to the model. If the template does not support + function calling, this argument will have no effect. + + > Parameters that control the training + + loss_type (`str` or `list[str]`, *optional*, defaults to `"sigmoid"`): + Type of loss to use. Possible values are: + + - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. + - `"hinge"`: hinge loss on the normalized likelihood from the + [SLiC](https://huggingface.co/papers/2305.10425) paper. + - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper. + - `"exo_pair"`: pairwise EXO loss from the [EXO](https://huggingface.co/papers/2402.00856) paper. + - `"nca_pair"`: pairwise NCA loss from the [NCA](https://huggingface.co/papers/2402.05369) paper. + - `"robust"`: unbiased estimate of the DPO loss that is robust to preference noise from the [Robust + DPO](https://huggingface.co/papers/2403.00409) paper. + - `"bco_pair"`: pairwise BCO loss from the [BCO](https://huggingface.co/papers/2404.04656) paper. + - `"sppo_hard"`: SPPO loss with hard label from the [SPPO](https://huggingface.co/papers/2405.00675) + paper. + - `"aot"`: AOT loss for paired datasets from the [AOT](https://huggingface.co/papers/2406.05882) paper. + - `"aot_pair"`: AOT loss for unpaired datasets from the [AOT](https://huggingface.co/papers/2406.05882) + paper. + - `"discopop"`: DiscoPOP (a.k.a Log-Ratio Modulated Loss, LRML) loss from the + [DiscoPOP](https://huggingface.co/papers/2406.08414) paper. + - `"apo_zero"`: APO-zero loss from the [APO](https://huggingface.co/papers/2408.06266) paper. + - `"apo_down"`: APO-down loss from the [APO](https://huggingface.co/papers/2408.06266) paper. + - `"sft"`: Negative log-likelihood loss (standard supervised fine-tuning loss). + + Multiple loss types can be combined using comma separation (e.g., `["sigmoid", "bco_pair", "sft"]` for + [MPO](https://huggingface.co/papers/2411.10442)). The `loss_weights` parameter can be used to specify + corresponding weights for each loss type. + + use_liger_loss (`bool`, *optional*, defaults to `False`): + Whether to use Liger loss. + base_model_attribute_name (`str`, *optional*, defaults to `"model"`): + Name of the attribute in the model that contains the base model. This is used to get the base model from + the model when the model does not have a `get_decoder` method in the case when `use_liger_loss` is `True`. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in + the [paper](https://huggingface.co/papers/2310.12036). 
+ f_divergence_type (`str`, *optional*, defaults to `FDivergenceType.REVERSE_KL`): + Type of f-divergence regularization function to compute divergence between policy and reference model. + f_alpha_divergence_coef (`float`, *optional*, defaults to `1.0`): + α coefficient in the α-divergence u^-α regularization function for DPO loss. + reference_free (`bool`, *optional*, defaults to `False`): + Whether to ignore the provided reference model and implicitly use a reference model that assigns equal + probability to all responses. + label_smoothing (`float`, *optional*, defaults to `0.0`): + Robust DPO label smoothing parameter from the [cDPO report](https://ericmitchell.ai/cdpo.pdf) and [Robust + DPO](https://huggingface.co/papers/2403.00409) paper that should be between `0.0` and `0.5`. + use_weighting (`bool`, *optional*, defaults to `False`): + Whether to weight the loss as done in the [WPO paper](https://huggingface.co/papers/2406.11827). + rpo_alpha (`float`, *optional*, defaults to `None`): + α parameter from the [RPO paper](https://huggingface.co/papers/2404.19733) (v3), which controls the + weighting of the NLL term in the loss. If `None`, no weighting is applied and the loss is the same as the + DPO loss. The paper recommends `rpo_alpha=1.0`. + ld_alpha (`float` or `None`, *optional*, defaults to `None`): + α parameter from the [LD-DPO paper](https://huggingface.co/papers/2409.06411), which controls the weighting + of the verbose token log-probabilities in responses. If `None`, no weighting is applied to the verbose + part, and the loss is equivalent to the standard DPO loss. The paper recommends setting `ld_alpha` between + `0.0` and `1.0`. + discopop_tau (`float`, *optional*, defaults to `0.05`): + τ/temperature parameter from the [DiscoPOP](https://huggingface.co/papers/2406.08414) paper, which controls + the shape of log ratio modulated loss. The paper recommends the default value `discopop_tau=0.05`. + loss_weights (`list[float]` or `None`, *optional*, defaults to `None`): + List of loss weights for multi-loss combinations. Used when combining multiple loss types. Example: `[0.8, + 0.2, 1.0]` for [MPO](https://huggingface.co/papers/2411.10442). If not provided, defaults to equal weights + (`1.0`) for all loss types. + sync_ref_model (`bool`, *optional*, defaults to `False`): + Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using + the `ref_model_mixup_alpha` parameter. This synchronization originates from the + [TR-DPO](https://huggingface.co/papers/2404.09656) paper. + ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`): + α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix + between the current policy and the previous reference policy during updates. The reference policy is + updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you + must set `sync_ref_model=True`. + ref_model_sync_steps (`int`, *optional*, defaults to `512`): + τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how + frequently the current policy is synchronized with the reference policy. To use this parameter, you must + set `sync_ref_model=True`. + + > Parameters that control the logging + + generate_during_eval (`bool`, *optional*, defaults to `False`): + Whether to generate and log completions from both the model and the reference model to W&B or Comet during + evaluation. 
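+
+Example (a minimal sketch; `output_dir` and the hyperparameter values below are illustrative, not
+recommendations):
+
+```python
+from trl import DPOConfig
+
+# Combine several loss types with per-loss weights (MPO-style), as described above.
+training_args = DPOConfig(
+    output_dir="dpo-model",
+    beta=0.1,
+    loss_type=["sigmoid", "bco_pair", "sft"],
+    loss_weights=[0.8, 0.2, 1.0],
+    max_prompt_length=512,
+    max_length=1024,
+)
+```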
+ + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + max_seq_length : Optional[int] = field( + default = None, + metadata = {'help': 'Maximum sequence length to truncate to.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = None, + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + parallelism_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + project = 'huggingface', + trackio_space_id = 'trackio', + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = True, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout 
= 1800,
+        torch_compile = False,
+        torch_compile_backend = None,
+        torch_compile_mode = None,
+        include_tokens_per_second = False,
+        include_num_input_tokens_seen = False,
+        neftune_noise_alpha = None,
+        optim_target_modules = None,
+        batch_eval_metrics = False,
+        eval_on_start = False,
+        use_liger_kernel = False,
+        liger_kernel_config = None,
+        eval_use_gather_object = False,
+        average_tokens_across_devices = True,
+        model_init_kwargs = None,
+        ref_model_init_kwargs = None,
+        model_adapter_name = None,
+        ref_adapter_name = None,
+        force_use_ref_model = False,
+        disable_dropout = True,
+        use_logits_to_keep = False,
+        dataset_num_proc = None,
+        padding_value = None,
+        label_pad_token_id = -100,
+        max_prompt_length = 512,
+        max_completion_length = None,
+        max_length = 1024,
+        truncation_mode = 'keep_end',
+        padding_free = False,
+        precompute_ref_log_probs = False,
+        precompute_ref_batch_size = None,
+        tools = None,
+        use_liger_loss = False,
+        base_model_attribute_name = 'model',
+        beta = 0.1,
+        f_alpha_divergence_coef = 1.0,
+        reference_free = False,
+        label_smoothing = 0.0,
+        use_weighting = False,
+        rpo_alpha = None,
+        ld_alpha = None,
+        discopop_tau = 0.05,
+        loss_weights = None,
+        sync_ref_model = False,
+        ref_model_mixup_alpha = 0.6,
+        ref_model_sync_steps = 512,
+        generate_during_eval = False,
+        vllm_sampling_params = None,
+        unsloth_num_chunks = -1,
+        max_seq_length = None,
+        **kwargs,
+    ):
+        if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small (less than 1e-7)! Consider increasing it, otherwise gradient updates will be close to 0!')
+        if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too large (> 1)! Consider decreasing it to 1e-1, otherwise gradient updates will explode!')
+        if output_dir is None and save_strategy == 'steps' and save_steps == 500:
+            output_dir = 'unsloth_training_checkpoints'
+            save_strategy = 'no'
+        if dataset_num_proc is None:
+            from multiprocessing import cpu_count
+            dataset_num_proc = min(max(cpu_count()+4, 2), 64)
+
+        super().__init__(
+            output_dir = output_dir,
+            overwrite_output_dir = overwrite_output_dir,
+            do_train = do_train,
+            do_eval = do_eval,
+            do_predict = do_predict,
+            eval_strategy = eval_strategy,
+            prediction_loss_only = prediction_loss_only,
+            per_device_train_batch_size = per_device_train_batch_size,
+            per_device_eval_batch_size = per_device_eval_batch_size,
+            per_gpu_train_batch_size = per_gpu_train_batch_size,
+            per_gpu_eval_batch_size = per_gpu_eval_batch_size,
+            gradient_accumulation_steps = gradient_accumulation_steps,
+            eval_accumulation_steps = eval_accumulation_steps,
+            eval_delay = eval_delay,
+            torch_empty_cache_steps = torch_empty_cache_steps,
+            learning_rate = learning_rate,
+            weight_decay = weight_decay,
+            adam_beta1 = adam_beta1,
+            adam_beta2 = adam_beta2,
+            adam_epsilon = adam_epsilon,
+            max_grad_norm = max_grad_norm,
+            num_train_epochs = num_train_epochs,
+            max_steps = max_steps,
+            lr_scheduler_type = lr_scheduler_type,
+            warmup_ratio = warmup_ratio,
+            warmup_steps = warmup_steps,
+            log_level = log_level,
+            log_level_replica = log_level_replica,
+            log_on_each_node = log_on_each_node,
+            logging_dir = logging_dir,
+            logging_strategy = logging_strategy,
+            logging_first_step = logging_first_step,
+            logging_steps = logging_steps,
+            logging_nan_inf_filter = logging_nan_inf_filter,
+            save_strategy = save_strategy,
+            save_steps = save_steps,
+            save_total_limit = save_total_limit,
+            save_safetensors = save_safetensors,
+            save_on_each_node = save_on_each_node,
+
save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = 
average_tokens_across_devices,
+            model_init_kwargs = model_init_kwargs,
+            ref_model_init_kwargs = ref_model_init_kwargs,
+            model_adapter_name = model_adapter_name,
+            ref_adapter_name = ref_adapter_name,
+            force_use_ref_model = force_use_ref_model,
+            disable_dropout = disable_dropout,
+            use_logits_to_keep = use_logits_to_keep,
+            dataset_num_proc = dataset_num_proc,
+            padding_value = padding_value,
+            label_pad_token_id = label_pad_token_id,
+            max_prompt_length = max_prompt_length,
+            max_completion_length = max_completion_length,
+            max_length = max_length,
+            truncation_mode = truncation_mode,
+            padding_free = padding_free,
+            precompute_ref_log_probs = precompute_ref_log_probs,
+            precompute_ref_batch_size = precompute_ref_batch_size,
+            tools = tools,
+            use_liger_loss = use_liger_loss,
+            base_model_attribute_name = base_model_attribute_name,
+            beta = beta,
+            f_alpha_divergence_coef = f_alpha_divergence_coef,
+            reference_free = reference_free,
+            label_smoothing = label_smoothing,
+            use_weighting = use_weighting,
+            rpo_alpha = rpo_alpha,
+            ld_alpha = ld_alpha,
+            discopop_tau = discopop_tau,
+            loss_weights = loss_weights,
+            sync_ref_model = sync_ref_model,
+            ref_model_mixup_alpha = ref_model_mixup_alpha,
+            ref_model_sync_steps = ref_model_sync_steps,
+            generate_during_eval = generate_during_eval, **kwargs)
+        self.vllm_sampling_params = vllm_sampling_params
+        self.unsloth_num_chunks = unsloth_num_chunks
+        self.max_seq_length = max_seq_length
+pass
+
+class _UnslothDPOTrainer(Trainer):
+    """
+    Trainer for the Direct Preference Optimization (DPO) method.
+
+    This class is a wrapper around the [`transformers.Trainer`] class and inherits all of its attributes and methods.
+
+    Args:
+        model (`Union[str, PreTrainedModel]`):
+            Model to be trained. Can be either:
+
+            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
+              path to a *directory* containing model weights saved using
+              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+              using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
+              `args.model_init_kwargs`.
+            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
+        ref_model (`PreTrainedModelWrapper`):
+            Hugging Face transformer model with a causal language modelling head. Used for implicit reward computation
+            and loss. If no reference model is provided, the trainer will create a reference model with the same
+            architecture as the model to be optimized.
+        args ([`DPOConfig`], *optional*, defaults to `None`):
+            Configuration for this trainer. If `None`, a default configuration is used.
+        data_collator (`DataCollator`, *optional*):
+            Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`.
+            Will default to [`DataCollatorForPreference`].
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            Dataset to use for training. DPO supports [preference](#preference) type datasets. The format of the
+            samples can be either:
+
+            - [Standard](dataset_formats#standard): Each sample contains plain text.
+            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+              and content).
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
+            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+ processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If `None`, the processing class is loaded from the model's name + with [`~transformers.AutoTokenizer.from_pretrained`]. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return + a dictionary string to metric values. *Note* When passing TrainingArgs with `batch_eval_metrics` set to + `True`, your compute_metrics function must take a boolean `compute_result` argument. This will be triggered + after the last eval batch to signal that the function needs to calculate and return the global summary + statistics rather than accumulating the batch-level statistics. + callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`): + List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed + in [here](https://huggingface.co/docs/transformers/main_classes/callback). + + If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`] + method. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): + A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your + model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + optimizer_cls_and_kwargs (`Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*, defaults to `None`): + A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in + `args`. Incompatible with the `optimizers` argument. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*, defaults to `None`): + A function that preprocess the logits right before caching them at each evaluation step. Must take two + tensors, the logits and the labels, and return the logits once processed as desired. The modifications made + by this function will be reflected in the predictions received by `compute_metrics`. + + Note that the labels (second parameter) will be `None` if the dataset does not have them. + peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`): + PEFT configuration used to wrap the model. If `None`, the model is not wrapped. 
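+
+    Example (a minimal sketch, assuming the public preference dataset
+    `trl-lib/ultrafeedback_binarized` and a small instruct model; substitute your own model and data):
+
+    ```python
+    from datasets import load_dataset
+    from trl import DPOConfig, DPOTrainer
+
+    dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
+    training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO")
+    trainer = DPOTrainer(model="Qwen/Qwen2-0.5B-Instruct", args=training_args, train_dataset=dataset)
+    trainer.train()
+    ```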
+ """ + + _tag_names = ["trl", "dpo"] + + def __init__( + self, + model: Union[str, nn.Module, PreTrainedModel], + ref_model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, + args: Optional[DPOConfig] = None, + data_collator: Optional[DataCollator] = None, # type: ignore + train_dataset: Optional[Union[Dataset, IterableDataset]] = None, + eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), + optimizer_cls_and_kwargs: Optional[tuple[type[torch.optim.Optimizer], dict[str, Any]]] = None, + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional["PeftConfig"] = None, + ): + # Args + model_id = model if isinstance(model, str) else model.config._name_or_path + if args is None: + model_name = model_id.split("/")[-1] + args = DPOConfig(f"{model_name}-DPO") + + # Handle the tokenizer + if processing_class is None: + processing_class = AutoTokenizer.from_pretrained(model_id) + + if args.padding_value is not None: + self.padding_value = args.padding_value + else: + if hasattr(processing_class, "pad_token_id") and processing_class.pad_token_id is not None: + self.padding_value = processing_class.pad_token_id + elif hasattr(processing_class, "tokenizer") and processing_class.tokenizer.pad_token_id is not None: + self.padding_value = processing_class.tokenizer.pad_token_id + else: + raise ValueError( + "`padding_value` is not specified in `DPOConfig`, and `pad_token_id` is missing in the " + "`processing_class`. Please either set the `padding_value` argument in `DPOConfig`, or set " + "`tokenizer.pad_token` (e.g., `tokenizer.pad_token = tokenizer.eos_token`) before instantiating " + "the trainer." + ) + + # Model + if not isinstance(model, str) and ref_model is model: + raise ValueError( + "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the " + "same as `model`, you must mass a copy of it, or `None` if you use peft." + ) + + if args.model_init_kwargs is not None and not isinstance(model, str): + logger.warning( + "You passed model_init_kwargs to the `DPOConfig`, but your model is already instantiated. " + "The `model_init_kwargs` will be ignored." + ) + if isinstance(model, str): + model = self._create_model_from_path(model, args) + + if args.ref_model_init_kwargs is not None and not isinstance(ref_model, str): + logger.warning( + "You passed ref_model_init_kwargs to the `DPOConfig`, but your ref_model is already instantiated. " + "The `ref_model_init_kwargs` will be ignored." + ) + if isinstance(ref_model, str): + ref_model = self._create_model_from_path(ref_model, args, is_ref=True) + + # PEFT configuration and model wrapping + model = self._prepare_peft_model(model, ref_model, peft_config, args) + + if args.generate_during_eval and not (is_wandb_available() or is_comet_available() or is_mlflow_available()): + raise ValueError( + "`generate_during_eval=True` requires Weights and Biases, MLFlow or Comet to be installed." + " Please install `wandb`, `mlflow` or `comet-ml` to resolve." 
+ ) + + self.is_encoder_decoder = model.config.is_encoder_decoder + self.is_vision_model = model.config.model_type in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.keys() + self.is_peft_model = is_peft_available() and isinstance(model, PeftModel) + self.model_adapter_name = args.model_adapter_name + self.ref_adapter_name = args.ref_adapter_name + self.reference_free = args.reference_free + + if ref_model: + self.ref_model = ref_model + elif self.is_peft_model or args.precompute_ref_log_probs: + # The `model` with adapters turned off will be used as the reference model + self.ref_model = None + else: + self.ref_model = create_reference_model(model) + + # Disable dropout in the model and reference model + if args.disable_dropout: + disable_dropout_in_model(model) + if self.ref_model is not None: + disable_dropout_in_model(self.ref_model) + + # Liger kernel + if args.use_liger_loss: + if not is_liger_kernel_available(): + raise ImportError( + "You set `use_liger_loss=True` but the liger kernel is not available. " + "Please install liger-kernel first: `pip install liger-kernel`" + ) + if args.loss_type not in ["sigmoid", "apo_zero", "apo_down", "sppo_hard", "nca_pair"]: + raise ValueError( + "You set `use_liger_loss=True` but the loss type is not from `[sigmoid, apo_zero, apo_down, sppo_hard, nca_pair`. " + "Please set `loss_type='[sigmoid | apo_zero | apo_down | sppo_hard | nca_pair]'` to use the liger kernel." + ) + self.dpo_loss_fn = LigerFusedLinearDPOLoss( + ignore_index=args.label_pad_token_id, + beta=args.beta, + use_ref_model=not args.reference_free, + average_log_prob=False, + loss_type=args.loss_type, + ) + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in DPO, the sampled data does not include the + # "input_ids" key. Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and + # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens + # of the input, floating-point operations will not be computed." To suppress this warning, we set the + # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate + # that the warning has already been issued. + model.warnings_issued["estimate_tokens"] = True + + # Data collator + if data_collator is None: + data_collator = DataCollatorForPreference(pad_token_id=self.padding_value) + + self.generate_during_eval = args.generate_during_eval + self.label_pad_token_id = args.label_pad_token_id + self.max_prompt_length = args.max_prompt_length + self.max_completion_length = args.max_completion_length + self.max_length = args.max_length + self.truncation_mode = args.truncation_mode + self.precompute_ref_log_probs = args.precompute_ref_log_probs + self.use_logits_to_keep = args.use_logits_to_keep + + if args.padding_free: + if model.config._attn_implementation != "flash_attention_2": + logger.warning( + "Padding-free training is enabled, but the attention implementation is not set to " + "'flash_attention_2'. Padding-free training flattens batches into a single sequence, and " + "'flash_attention_2' is the only known attention mechanism that reliably supports this. Using " + "other implementations may lead to unexpected behavior. To ensure compatibility, set " + "`attn_implementation='flash_attention_2'` in the model configuration, or verify that your " + "attention mechanism can handle flattened sequences." 
+ ) + if args.per_device_train_batch_size == 1: + logger.warning( + "You are using a per_device_train_batch_size of 1 with padding-free training. Using a batch size " + "of 1 anihilate the benefits of padding-free training. Please consider increasing the batch size " + "to at least 2." + ) + self.padding_free = args.padding_free + + # Since ref_logs are precomputed on the first call to get_train/eval_dataloader + # keep track of first called to avoid computation of future calls + self._precomputed_train_ref_log_probs = False + self._precomputed_eval_ref_log_probs = False + + self.beta = args.beta + self.label_smoothing = args.label_smoothing + self.loss_type = args.loss_type if isinstance(args.loss_type, list) else [args.loss_type] + self.loss_weights = args.loss_weights + self.aux_loss_enabled = getattr(model.config, "output_router_logits", False) + self.use_weighting = args.use_weighting + self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0) + if self.aux_loss_enabled and self.aux_loss_coef == 0.0: + logger.warning( + "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to " + "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value " + "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary " + "loss.", + ) + for loss_type in self.loss_type: + if ( + loss_type in ["hinge", "ipo", "bco_pair", "sppo_hard", "nca_pair", "apo_zero", "apo_down"] + and args.label_smoothing > 0 + ): + logger.warning( + f"You are using the {loss_type} loss type that does not support label smoothing. The " + "`label_smoothing` parameter will be ignored. Set `label_smoothing` to `0.0` to remove this " + "warning.", + ) + if loss_type == "kto_pair": + raise ValueError("Support for kto_pair has been removed in DPOTrainer. Please use KTOTrainer.") + + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + self.f_divergence_type = args.f_divergence_type + self.f_divergence_params = {FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY: args.f_alpha_divergence_coef} + self.dataset_num_proc = args.dataset_num_proc + + # Dataset preparation + train_dataset = self._prepare_dataset(train_dataset, processing_class, args, "train") + if eval_dataset is not None: + if isinstance(eval_dataset, dict): + eval_dataset = { + key: self._prepare_dataset(dataset, processing_class, args, key) + for key, dataset in eval_dataset.items() + } + else: + eval_dataset = self._prepare_dataset(eval_dataset, processing_class, args, "eval") + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + optimizer_cls_and_kwargs=optimizer_cls_and_kwargs, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. 
+ self.model_accepts_loss_kwargs = False + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + if not hasattr(self, "accelerator"): + raise AttributeError( + "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`." + ) + + # Deepspeed Zero-3 does not support precompute_ref_log_probs + if self.is_deepspeed_enabled: + if self.accelerator.state.deepspeed_plugin.zero_stage == 3 and self.precompute_ref_log_probs: + raise ValueError( + "You cannot use `precompute_ref_log_probs=True` with Deepspeed ZeRO-3. Please set `precompute_ref_log_probs=False`." + ) + + if self.ref_model is None: + if not (self.is_peft_model or self.precompute_ref_log_probs): + raise ValueError( + "No reference model and model is not a Peft model. Try setting `precompute_ref_log_probs=True`" + ) + if args.sync_ref_model: + raise ValueError( + "You currently cannot use `ref_model=None` with TR-DPO method. Please provide `ref_model`." + ) + else: + if self.is_deepspeed_enabled: + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) + elif self.is_fsdp_enabled: + self.ref_model = prepare_fsdp(self.ref_model, self.accelerator) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + + if args.sync_ref_model: + if self.precompute_ref_log_probs: + raise ValueError( + "You cannot use `precompute_ref_log_probs=True` with TR-DPO method. Please set `precompute_ref_log_probs=False`." + ) + + self.add_callback(SyncRefModelCallback(ref_model=self.ref_model, accelerator=self.accelerator)) + + if "bco_pair" in self.loss_type: + self.running = RunningMoments(self.accelerator) + + def _create_model_from_path(self, model_path: str, args: DPOConfig, is_ref: bool = False) -> PreTrainedModel: + """Creates a model from a path or model identifier.""" + if not is_ref: + model_init_kwargs = args.model_init_kwargs or {} + else: + model_init_kwargs = args.ref_model_init_kwargs or {} + + # Handle torch dtype + dtype = model_init_kwargs.get("dtype") + if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None: + pass # dtype is already a torch.dtype or "auto" or None + elif isinstance(dtype, str): # it's a str, but not "auto" + dtype = getattr(torch, dtype) + model_init_kwargs["dtype"] = dtype + else: + raise ValueError( + "Invalid `dtype` passed to `DPOConfig`. Expected either 'auto' or a string representing " + f"a `torch.dtype` (e.g., 'float32'), but got {dtype}." + ) + + # Create model + model = AutoModelForCausalLM.from_pretrained(model_path, **model_init_kwargs) + return model + + def _prepare_peft_model( + self, model: PreTrainedModel, ref_model: PreTrainedModel, peft_config: Any, args: DPOConfig + ) -> PreTrainedModel: + """Prepares a model for PEFT training.""" + # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16` + # has been called in order to properly call autocast if needed. 
+        self._peft_has_been_casted_to_bf16 = False
+
+        if not is_peft_available() and peft_config is not None:
+            raise ValueError(
+                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
+            )
+        elif is_peft_available() and peft_config is not None:
+            # if model is a peft model and we have a peft_config, we merge and unload it first
+            if isinstance(model, PeftModel):
+                model = model.merge_and_unload()
+
+            if ref_model is not None and not args.force_use_ref_model:
+                raise ValueError(
+                    "You passed both a ref_model and a peft_config. For training PEFT adapters with DPO there is no need to pass a reference"
+                    " model. Please pass `ref_model=None` if you want to train PEFT adapters, or pass a ref_model with"
+                    " `force_use_ref_model=True` in DPOTrainer's init if you want to use a different ref_model."
+                )
+
+            if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False):
+                _support_gc_kwargs = hasattr(
+                    args, "gradient_checkpointing_kwargs"
+                ) and "gradient_checkpointing_kwargs" in list(
+                    inspect.signature(prepare_model_for_kbit_training).parameters
+                )
+
+                prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing}
+
+                if _support_gc_kwargs:
+                    prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs
+
+                model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
+
+            else:
+                model = self._prepare_gradient_checkpointing(model, args)
+
+            # get peft model with the given config
+            model = get_peft_model(model, peft_config)
+            if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(model)
+                # If args.bf16 we need to explicitly call `generate` with the torch amp autocast context manager
+                self._peft_has_been_casted_to_bf16 = True
+
+        else:
+            model = self._prepare_gradient_checkpointing(model, args)
+
+        return model
+
+    def _prepare_gradient_checkpointing(self, model: PreTrainedModel, args: DPOConfig):
+        """Prepare gradient checkpointing for the model."""
+        # For models that use gradient_checkpointing, we need to attach a hook that enables the inputs
+        # to explicitly have `requires_grad=True`, otherwise training will either fail silently
+        # or fail completely.
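+        # (With PEFT adapters the frozen embedding output is a leaf that does not require grad,
+        # so without `requires_grad=True` on the inputs the checkpointed segments would have no
+        # path to backpropagate through.)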
+ if args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + return model + + def _prepare_dataset( + self, + dataset: Union[Dataset, IterableDataset], + processing_class: Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin], + args: DPOConfig, + dataset_name: str, + ) -> Union[Dataset, IterableDataset]: + # Build the kwargs for the `map` function + map_kwargs = {} + if isinstance(dataset, Dataset): # IterableDataset does not support num_proc nor writer_batch_size + map_kwargs["num_proc"] = args.dataset_num_proc + map_kwargs["writer_batch_size"] = 10 + + with PartialState().main_process_first(): + # Extract prompt if needed + if isinstance(dataset, Dataset): # `IterableDataset.map` does not support `desc` + map_kwargs["desc"] = f"Extracting prompt in {dataset_name} dataset" + dataset = dataset.map(maybe_extract_prompt, **map_kwargs) + + # Apply the chat template if needed + if isinstance(dataset, Dataset): # `IterableDataset.map` does not support `desc` + map_kwargs["desc"] = f"Applying chat template to {dataset_name} dataset" + dataset = dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class, "tools": args.tools}, **map_kwargs + ) + + # Tokenize the dataset + if isinstance(dataset, Dataset): # `IterableDataset.map` does not support `desc` + map_kwargs["desc"] = f"Tokenizing {dataset_name} dataset" + + dataset = dataset.map( + self.tokenize_row if not self.is_vision_model else self.process_row, + remove_columns=["chosen", "rejected"], + fn_kwargs={ + "processing_class": processing_class, + "max_prompt_length": args.max_prompt_length, + "max_completion_length": args.max_completion_length, + # for enc-dec, we add the special tokens ([bos_token] + prompt + [eos_token]; completion + [eos_token]) + "add_special_tokens": False, + }, + **map_kwargs, + ) + + return dataset + + @staticmethod + def tokenize_row( + features: dict[str, str], + processing_class: PreTrainedTokenizerBase, + max_prompt_length: Optional[int] = None, + max_completion_length: Optional[int] = None, + add_special_tokens: bool = True, + ) -> dict[str, list[int]]: + """ + Tokenize a row of the dataset. + + Args: + features (`dict[str, str]`): + Row of the dataset, should contain the keys `"prompt"`, `"chosen"`, and `"rejected"`. + processing_class (`PreTrainedTokenizerBase`): + Processing class used to process the data. + max_prompt_length (`int` or `None`): + Maximum length of the prompt sequence. If `None`, the prompt sequence is not truncated. + max_completion_length (`int` or `None`): + Maximum length of the completion sequences. If `None`, the completion sequences are not truncated. + add_special_tokens (`bool`): + Whether to add special tokens to the sequences. Typically used for encoder-decoder models. If `True`, + the prompt sequence will have a bos token prepended and an eos token appended. In any case, the + completion sequences will have an eos token appended. + + Returns: + `dict[str, list[int]]`: + Tokenized sequences with the keys `"prompt_input_ids"`, `"chosen_input_ids"`, and + `"rejected_input_ids". 
+ + Example: + ```python + >>> from transformers import GPT2Tokenizer + + >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + >>> features = {"prompt": "The sky is", "chosen": " blue", "rejected": " green"} + >>> DPOTrainer.tokenize_row( + ... features, tokenizer, max_prompt_length=3, max_completion_length=3, add_special_tokens=False + ... ) + {'prompt_input_ids': [464, 6766, 318], 'chosen_input_ids': [4171, 50256], 'rejected_input_ids': [4077, 50256]} + ``` + """ + tokenizer = processing_class # the processing class is a tokenizer + prompt_input_ids = tokenizer(features["prompt"], add_special_tokens=False)["input_ids"] + chosen_input_ids = tokenizer(features["chosen"], add_special_tokens=False)["input_ids"] + rejected_input_ids = tokenizer(features["rejected"], add_special_tokens=False)["input_ids"] + + # Add special tokens (typically for encoder-decoder models) + if add_special_tokens: + if tokenizer.bos_token_id is not None: + prompt_input_ids = [tokenizer.bos_token_id] + prompt_input_ids + if tokenizer.eos_token_id is not None: + prompt_input_ids = prompt_input_ids + [tokenizer.eos_token_id] + chosen_input_ids = chosen_input_ids + [tokenizer.eos_token_id] + rejected_input_ids = rejected_input_ids + [tokenizer.eos_token_id] + + # Truncate prompt and completion sequences + if max_prompt_length is not None: + prompt_input_ids = prompt_input_ids[-max_prompt_length:] + if max_completion_length is not None: + chosen_input_ids = chosen_input_ids[:max_completion_length] + rejected_input_ids = rejected_input_ids[:max_completion_length] + + return { + "prompt_input_ids": prompt_input_ids, + "chosen_input_ids": chosen_input_ids, + "rejected_input_ids": rejected_input_ids, + } + + @staticmethod + def process_row( + features: dict[str, str], + processing_class: PreTrainedTokenizerBase, + max_prompt_length: Optional[int] = None, + max_completion_length: Optional[int] = None, + add_special_tokens: bool = True, + ) -> dict[str, list[int]]: + """ + Same as `tokenize_row` but for vision models. Please refer to `tokenize_row` for more information. 
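+
+        Example (a hypothetical sketch: `processor` is assumed to be a vision processor, e.g. one
+        returned by `AutoProcessor.from_pretrained`, and `image` a PIL image; the resulting token
+        ids depend on the processor, so no output is shown):
+
+        ```python
+        >>> features = {"images": [image], "prompt": "The sky is", "chosen": " blue", "rejected": " green"}
+        >>> row = DPOTrainer.process_row(
+        ...     features, processor, max_prompt_length=3, max_completion_length=3, add_special_tokens=False
+        ... )
+        ```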
+ """ + processor, tokenizer = processing_class, processing_class.tokenizer # the processing class is a processor + processed_features = processor(images=features["images"], text=features["prompt"], add_special_tokens=False) + + prompt_input_ids = processed_features["input_ids"][0] + pixel_values = processed_features["pixel_values"][0] + chosen_input_ids = tokenizer(features["chosen"], add_special_tokens=False)["input_ids"] + rejected_input_ids = tokenizer(features["rejected"], add_special_tokens=False)["input_ids"] + + # Add special tokens (typically for encoder-decoder models) + if add_special_tokens: + if tokenizer.bos_token_id is not None: + prompt_input_ids = [tokenizer.bos_token_id] + prompt_input_ids + if tokenizer.eos_token_id is not None: + prompt_input_ids = prompt_input_ids + [tokenizer.eos_token_id] + chosen_input_ids = chosen_input_ids + [tokenizer.eos_token_id] + rejected_input_ids = rejected_input_ids + [tokenizer.eos_token_id] + + # Truncate prompt and completion sequences + if max_prompt_length is not None: + prompt_input_ids = prompt_input_ids[-max_prompt_length:] + if max_completion_length is not None: + chosen_input_ids = chosen_input_ids[:max_completion_length] + rejected_input_ids = rejected_input_ids[:max_completion_length] + + output = { + "prompt_input_ids": prompt_input_ids, + "pixel_values": pixel_values, + "chosen_input_ids": chosen_input_ids, + "rejected_input_ids": rejected_input_ids, + } + + if "pixel_attention_mask" in processed_features: + output["pixel_attention_mask"] = processed_features["pixel_attention_mask"][0] + if "image_sizes" in processed_features: + output["image_sizes"] = processed_features["image_sizes"][0] + + return output + + def _set_signature_columns_if_needed(self): + # If `self.args.remove_unused_columns` is True, non-signature columns are removed. + # By default, this method sets `self._signature_columns` to the model's expected inputs. + # In DPOTrainer, we preprocess data, so using the model's signature columns doesn't work. + # Instead, we set them to the columns expected by `DataCollatorForPreference`, hence the override. + if self._signature_columns is None: + self._signature_columns = [ + "prompt_input_ids", + "chosen_input_ids", + "rejected_input_ids", + "image_sizes", + "ref_chosen_logps", + "ref_rejected_logps", + ] + + def get_train_dataloader(self) -> DataLoader: + """ + Returns the training [`~torch.utils.data.DataLoader`]. + + Subclass of transformers.src.transformers.trainer.get_train_dataloader to precompute `ref_log_probs`. 
+ """ + + if self.precompute_ref_log_probs and not self._precomputed_train_ref_log_probs: + batch_size = self.args.precompute_ref_batch_size or self.args.per_device_train_batch_size + dataloader_params = { + "batch_size": batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(self.train_dataset, **dataloader_params)) + + ref_chosen_logps = [] + ref_rejected_logps = [] + for padded_batch in tqdm(iterable=data_loader, desc="Train dataset reference log probs"): + ref_chosen_logp, ref_rejected_logp = self.compute_ref_log_probs(padded_batch) + ref_chosen_logp, ref_rejected_logp = self.accelerator.gather_for_metrics( + (ref_chosen_logp, ref_rejected_logp) + ) + ref_chosen_logps.append(ref_chosen_logp.cpu()) + ref_rejected_logps.append(ref_rejected_logp.cpu()) + + # Unnecessary cache clearing to avoid OOM + empty_cache() + self.accelerator.free_memory() + + all_ref_chosen_logps = torch.cat(ref_chosen_logps).float().numpy() + all_ref_rejected_logps = torch.cat(ref_rejected_logps).float().numpy() + + self.train_dataset = self.train_dataset.add_column(name="ref_chosen_logps", column=all_ref_chosen_logps) + self.train_dataset = self.train_dataset.add_column( + name="ref_rejected_logps", column=all_ref_rejected_logps + ) + + self._precomputed_train_ref_log_probs = True + + return super().get_train_dataloader() + + def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: + """ + Returns the evaluation [`~torch.utils.data.DataLoader`]. + + Subclass of transformers.src.transformers.trainer.get_eval_dataloader to precompute `ref_log_probs`. + + Args: + eval_dataset (`torch.utils.data.Dataset`, *optional*): + If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted + by the `model.forward()` method are automatically removed. It must implement `__len__`. 
+ """ + if eval_dataset is None and self.eval_dataset is None: + raise ValueError("Trainer: evaluation requires an eval_dataset.") + eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + + if self.precompute_ref_log_probs and not self._precomputed_eval_ref_log_probs: + batch_size = self.args.precompute_ref_batch_size or self.args.per_device_eval_batch_size + dataloader_params = { + "batch_size": batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(eval_dataset, **dataloader_params)) + + ref_chosen_logps = [] + ref_rejected_logps = [] + for padded_batch in tqdm(iterable=data_loader, desc="Eval dataset reference log probs"): + ref_chosen_logp, ref_rejected_logp = self.compute_ref_log_probs(padded_batch) + ref_chosen_logp, ref_rejected_logp = self.accelerator.gather_for_metrics( + (ref_chosen_logp, ref_rejected_logp) + ) + ref_chosen_logps.append(ref_chosen_logp.cpu()) + ref_rejected_logps.append(ref_rejected_logp.cpu()) + + all_ref_chosen_logps = torch.cat(ref_chosen_logps).float().numpy() + all_ref_rejected_logps = torch.cat(ref_rejected_logps).float().numpy() + + eval_dataset = eval_dataset.add_column(name="ref_chosen_logps", column=all_ref_chosen_logps) + eval_dataset = eval_dataset.add_column(name="ref_rejected_logps", column=all_ref_rejected_logps) + + # Save calculated ref_chosen_logps and ref_rejected_logps to the eval_dataset for subsequent runs + if self.eval_dataset is not None: + self.eval_dataset = eval_dataset + self._precomputed_eval_ref_log_probs = True + + return super().get_eval_dataloader(eval_dataset=eval_dataset) + + @contextmanager + def null_ref_context(self): + """Context manager for handling null reference model (that is, peft adapter manipulation).""" + with ( + self.accelerator.unwrap_model(self.model).disable_adapter() + if self.is_peft_model and not self.ref_adapter_name + else nullcontext() + ): + if self.ref_adapter_name: + self.model.set_adapter(self.ref_adapter_name) + yield + if self.ref_adapter_name: + self.model.set_adapter(self.model_adapter_name or "default") + + def compute_ref_log_probs(self, batch: dict[str, torch.LongTensor]) -> tuple[torch.Tensor, torch.Tensor]: + """Computes log probabilities of the reference model for a single padded batch of a DPO specific dataset.""" + compte_ref_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + with torch.no_grad(), compte_ref_context_manager: + if self.ref_model is None: + with self.null_ref_context(): + ref_model_output = self.concatenated_forward(self.model, batch, is_ref_model=True) + else: + ref_model_output = self.concatenated_forward(self.ref_model, batch, is_ref_model=True) + return ref_model_output["chosen_logps"], ref_model_output["rejected_logps"] + + @staticmethod + def concatenated_inputs( + batch: dict[str, Union[list, torch.LongTensor]], padding_value: int + ) -> dict[str, torch.LongTensor]: + """ + Concatenate the `chosen` and `rejected` inputs from the batch into a single tensor for both the prompt and + completion sequences. + + Args: + batch (`dict[str, Union[list, torch.LongTensor]]`): + A batch of input data. The batch must contain the following keys: + + - `"prompt_input_ids"`: Tensor of shape `(batch_size, prompt_length)` representing the prompt input + IDs. 
+ - `"chosen_input_ids"`: Tensor of shape `(batch_size, chosen_length)` representing the chosen + completion input IDs. + - `"rejected_input_ids"`: Tensor of shape `(batch_size, rejected_length)` representing the rejected + completion input IDs. + - `"prompt_pixel_values"` (optional): Tensor for pixel values, if available. + - `"prompt_pixel_attention_mask"` (optional): Tensor for pixel attention masks, if available. + + padding_value (`int`): + The padding value to use for the concatenated completion sequences (`chosen_input_ids` and + `rejected_input_ids`). + + Returns: + `dict[str, torch.LongTensor]`: A dictionary containing: + + - `"prompt_input_ids"`: Concatenated prompt input IDs of shape `(2 * batch_size, prompt_length)`. + - `"completion_input_ids"`: Concatenated chosen and rejected completion input IDs of shape `(2 * + batch_size, max_completion_length)`. + - `"prompt_attention_mask"`: Concatenated prompt attention masks of shape `(2 * batch_size, + prompt_length)`. + - `"completion_attention_mask"`: Concatenated chosen and rejected attention masks of shape `(2 * + batch_size, max_completion_length)`. + - `"pixel_values"` (optional): Concatenated pixel values if `"prompt_pixel_values"` are present. + - `"pixel_attention_mask"` (optional): Concatenated pixel attention masks if + `"prompt_pixel_attention_mask"` are present. + + Notes: + The completion input IDs and attention masks are padded to the maximum completion length of the chosen or + rejected sequences. + """ + output = {} + + # For the prompt, the input_ids are the same for both the chosen and rejected responses + output["prompt_input_ids"] = torch.cat([batch["prompt_input_ids"], batch["prompt_input_ids"]], dim=0) + output["prompt_attention_mask"] = torch.cat( + [batch["prompt_attention_mask"], batch["prompt_attention_mask"]], dim=0 + ) + if "pixel_values" in batch: + output["pixel_values"] = torch.cat([batch["pixel_values"], batch["pixel_values"]], dim=0) + + if "pixel_attention_mask" in batch: + output["pixel_attention_mask"] = torch.cat( + [batch["pixel_attention_mask"], batch["pixel_attention_mask"]], dim=0 + ) + if "image_sizes" in batch: + output["image_sizes"] = torch.cat([batch["image_sizes"], batch["image_sizes"]], dim=0) + + # Concatenate the chosen and rejected completions + max_completion_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1]) + output["completion_input_ids"] = torch.cat( + ( + pad_to_length(batch["chosen_input_ids"], max_completion_length, pad_value=padding_value), + pad_to_length(batch["rejected_input_ids"], max_completion_length, pad_value=padding_value), + ), + ) + output["completion_attention_mask"] = torch.cat( + ( + pad_to_length(batch["chosen_attention_mask"], max_completion_length, pad_value=0), + pad_to_length(batch["rejected_attention_mask"], max_completion_length, pad_value=0), + ), + ) + + return output + + def dpo_loss( + self, + chosen_logps: torch.FloatTensor, + rejected_logps: torch.FloatTensor, + ref_chosen_logps: torch.FloatTensor, + ref_rejected_logps: torch.FloatTensor, + loss_type: str = "sigmoid", + model_output: dict[str, torch.FloatTensor] = None, + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """ + Compute the DPO loss for a batch of policy and reference model log probabilities. + + Args: + chosen_logps (`torch.FloatTensor`): + Log probabilities of the model for the chosen responses. Shape: `(batch_size,)`. + rejected_logps (`torch.FloatTensor`): + Log probabilities of the model for the rejected responses. 
Shape: `(batch_size,)`. + ref_chosen_logps (`torch.FloatTensor`): + Log probabilities of the reference model for the chosen responses. Shape: `(batch_size,)`. + ref_rejected_logps (`torch.FloatTensor`): + Log probabilities of the reference model for the rejected responses. Shape: `(batch_size,)`. + loss_type (`str`, defaults to `"sigmoid"`): + The type of loss to compute. One of: + - `"sigmoid"`: Sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. + - `"hinge"`: Hinge loss on the normalized likelihood from the + [SLiC](https://huggingface.co/papers/2305.10425) paper. + - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper. + - `"exo_pair"`: Pairwise EXO loss from the [EXO](https://huggingface.co/papers/2402.00856) paper. + - `"nca_pair"`: Pairwise NCA loss from the [NCA](https://huggingface.co/papers/2402.05369) paper. + - `"robust"`: Unbiased estimate of the DPO loss that is robust to preference noise from the [Robust + DPO](https://huggingface.co/papers/2403.00409) paper. + - `"bco_pair"`: Pairwise BCO loss from the [BCO](https://huggingface.co/papers/2404.04656) paper. + - `"sppo_hard"`: SPPO loss with hard label from the [SPPO](https://huggingface.co/papers/2405.00675) + paper. + - `"aot"`: AOT loss for paired datasets from the [AOT](https://huggingface.co/papers/2406.05882) paper. + - `"aot_pair"`: AOT loss for unpaired datasets from the [AOT](https://huggingface.co/papers/2406.05882) + paper. + - `"discopop"`: DiscoPOP (a.k.a Log-Ratio Modulated Loss, LRML) loss from the + [DiscoPOP](https://huggingface.co/papers/2406.08414) paper. + - `"apo_zero"`: APO-zero loss from the [APO](https://huggingface.co/papers/2408.06266) paper. + - `"apo_down"`: APO-down loss from the [APO](https://huggingface.co/papers/2408.06266) paper. + - `"sft"`: Negative log-likelihood loss (standard supervised fine-tuning loss). + model_output (`dict[str, torch.FloatTensor]`, *optional*): + The output of the model's forward pass. This is used to compute auxiliary losses if enabled. + + Returns: + A tuple of three tensors: `(losses, chosen_rewards, rejected_rewards)`. The losses tensor contains the DPO + loss for each example in the batch. The `chosen_rewards` and `rejected_rewards` tensors contain the rewards + for the chosen and rejected responses, respectively. + """ + device = self.accelerator.device + + # Get the log ratios for the chosen and rejected responses + chosen_logratios = chosen_logps.to(device) - (not self.reference_free) * ref_chosen_logps.to(device) + rejected_logratios = rejected_logps.to(device) - (not self.reference_free) * ref_rejected_logps.to(device) + + if self.f_divergence_type == FDivergenceType.ALPHA_DIVERGENCE.value: + # The alpha-divergence formula: (1 - u^-alpha) / alpha + # The divergence difference between the chosen and rejected sample is: + # (1 - u[w]^-alpha) / alpha - (1 - u[l]^-alpha) / alpha + # = (u[l]^-alpha - u[w]^-alpha) / alpha + # where u[w] and u[l] are the policy/reference probability ratios + # for the chosen and rejected samples, respectively. 
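+            # Note: `cap_exp` exponentiates a clamped argument, which is meant to guard against float
+            # overflow when the log-ratios are large.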
+ alpha_coef = FDivergenceConstants.ALPHA_DIVERGENCE_COEF_DEFAULT + if self.f_divergence_params and FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY in self.f_divergence_params: + alpha_coef = float(self.f_divergence_params[FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY]) + logits = (cap_exp(rejected_logratios * -alpha_coef) - cap_exp(chosen_logratios * -alpha_coef)) / alpha_coef + else: + logratios = chosen_logps - rejected_logps + if self.reference_free: + ref_logratios = torch.tensor([0], dtype=logratios.dtype, device=logratios.device) + else: + ref_logratios = ref_chosen_logps - ref_rejected_logps + + logratios = logratios.to(self.accelerator.device) + ref_logratios = ref_logratios.to(self.accelerator.device) + logits = logratios - ref_logratios + + if self.f_divergence_type == FDivergenceType.JS_DIVERGENCE.value: + # The js-divergence formula: log(2 * u / (1 + u)) + # The divergence difference between the chosen and rejected sample is: + # log(2 * u[w] / (1 + u[w])) - log(2 * u[l] / (1 + u[l])) + # = log(u[w]) - log(u[l]) - (log(1 + u[w]) - log(1 + u[l])) + # where u[w] and u[l] are the policy/reference probability ratios + # for the chosen and rejected samples, respectively. + logits -= F.softplus(chosen_logratios) - F.softplus(rejected_logratios) + + # The beta is a temperature parameter for the DPO loss, typically something in the range of 0.1 to 0.5. + # We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the + # labels and calculates a conservative DPO loss. + if loss_type == "sigmoid": + losses = ( + -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * logits) * self.label_smoothing + ) + + elif loss_type == "robust": + losses = ( + -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) + + F.logsigmoid(-self.beta * logits) * self.label_smoothing + ) / (1 - 2 * self.label_smoothing) + + elif loss_type == "exo_pair": + # eqn (16) of the EXO paper: https://huggingface.co/papers/2402.00856 + import math + + if self.label_smoothing == 0: + self.label_smoothing = 1e-3 + losses = (self.beta * logits).sigmoid() * ( + F.logsigmoid(self.beta * logits) - math.log(1 - self.label_smoothing) + ) + (-self.beta * logits).sigmoid() * (F.logsigmoid(-self.beta * logits) - math.log(self.label_smoothing)) + + elif loss_type == "hinge": + losses = torch.relu(1 - self.beta * logits) + + elif loss_type == "ipo": + # eqn (17) of the paper where beta is the regularization parameter for the IPO loss, denoted by tau in the paper. + losses = (logits - 1 / (2 * self.beta)) ** 2 + + elif loss_type == "bco_pair": + chosen_logratios = chosen_logps - ref_chosen_logps + rejected_logratios = rejected_logps - ref_rejected_logps + chosen_rewards = self.beta * chosen_logratios + rejected_rewards = self.beta * rejected_logratios + rewards = torch.cat((chosen_rewards, rejected_rewards), 0).mean().detach() + self.running.update(rewards) + delta = self.running.mean + losses = -F.logsigmoid((self.beta * chosen_logratios) - delta) - F.logsigmoid( + -(self.beta * rejected_logratios - delta) + ) + + elif loss_type == "sppo_hard": + # In the paper (https://huggingface.co/papers/2405.00675), SPPO employs a soft probability approach, + # estimated using the PairRM score. The probability calculation is conducted outside of the trainer class. + # The version described here is the hard probability version, where P in Equation (4.7) of Algorithm 1 is + # set to 1 for the winner and 0 for the loser. 
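+            # Concretely, the squared penalties below drive the chosen log-ratio toward +1/(2*beta) and the
+            # rejected log-ratio toward -1/(2*beta).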
+ a = chosen_logps - ref_chosen_logps + b = rejected_logps - ref_rejected_logps + losses = (a - 0.5 / self.beta) ** 2 + (b + 0.5 / self.beta) ** 2 + + elif loss_type == "nca_pair": + chosen_rewards = (chosen_logps - ref_chosen_logps) * self.beta + rejected_rewards = (rejected_logps - ref_rejected_logps) * self.beta + losses = ( + -F.logsigmoid(chosen_rewards) + - 0.5 * F.logsigmoid(-chosen_rewards) + - 0.5 * F.logsigmoid(-rejected_rewards) + ) + + elif loss_type == "aot_pair": + chosen_logratios = chosen_logps - ref_chosen_logps + rejected_logratios = rejected_logps - ref_rejected_logps + chosen_logratios_sorted, _ = torch.sort(chosen_logratios, dim=0) + rejected_logratios_sorted, _ = torch.sort(rejected_logratios, dim=0) + delta = chosen_logratios_sorted - rejected_logratios_sorted + losses = ( + -F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * delta) * self.label_smoothing + ) + + elif loss_type == "aot": + logratios = chosen_logps - rejected_logps + ref_logratios = ref_chosen_logps - ref_rejected_logps + logratios_sorted, _ = torch.sort(logratios, dim=0) + ref_logratios_sorted, _ = torch.sort(ref_logratios, dim=0) + delta = logratios_sorted - ref_logratios_sorted + losses = ( + -F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * delta) * self.label_smoothing + ) + + elif loss_type == "apo_zero": + # Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266) + # Use this loss when you believe the chosen outputs are better than your model's default output + losses_chosen = 1 - F.sigmoid(self.beta * chosen_logratios) # Increase chosen likelihood + losses_rejected = F.sigmoid(self.beta * rejected_logratios) # Decrease rejected likelihood + losses = losses_chosen + losses_rejected + + elif loss_type == "apo_down": + # Eqn (8) of the APO paper (https://huggingface.co/papers/2408.06266) + # Use this loss when you believe the chosen outputs are worse than your model's default output. 
+            # Decrease chosen likelihood and decrease rejected likelihood more
+            losses_chosen = F.sigmoid(self.beta * chosen_logratios)
+            losses_rejected = 1 - F.sigmoid(self.beta * (chosen_logratios - rejected_logratios))
+            losses = losses_chosen + losses_rejected
+
+        elif loss_type == "discopop":
+            # Eqn (5) of the DiscoPOP paper (https://huggingface.co/papers/2406.08414)
+            # This objective was discovered automatically via LLM-driven discovery
+            logratios = chosen_logps - rejected_logps
+            ref_logratios = ref_chosen_logps - ref_rejected_logps
+            logits = logratios - ref_logratios
+            logits = logits * self.beta
+            # Modulate the mixing coefficient based on the log ratio magnitudes
+            log_ratio_modulation = torch.sigmoid(logits / self.args.discopop_tau)
+            logistic_component = -F.logsigmoid(logits)
+            exp_component = torch.exp(-logits)
+            # Blend between the logistic and exponential components based on the log ratio modulation
+            losses = logistic_component * (1 - log_ratio_modulation) + exp_component * log_ratio_modulation
+
+        elif loss_type == "sft":
+            # SFT loss is the negative log likelihood loss on the chosen responses
+            # This acts as the generation loss component in MPO
+            sft_loss = model_output["nll_loss"]
+            # Create a losses tensor with the same per-sample shape as the other losses
+            batch_size = chosen_logps.shape[0]
+            losses = sft_loss.expand(batch_size)
+            # For SFT, we don't have preference rewards, so use zeros
+            # (these placeholders are superseded by the unconditional reward computation below)
+            chosen_rewards = torch.zeros_like(chosen_logps)
+            rejected_rewards = torch.zeros_like(rejected_logps)
+
+        else:
+            raise ValueError(
+                f"Unknown loss type: {loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'exo_pair', "
+                "'nca_pair', 'robust', 'bco_pair', 'sppo_hard', 'aot', 'aot_pair', 'discopop', 'apo_zero', "
+                "'apo_down', 'sft']"
+            )
+
+        chosen_rewards = self.beta * (chosen_logps.to(device) - ref_chosen_logps.to(device)).detach()
+        rejected_rewards = self.beta * (rejected_logps.to(device) - ref_rejected_logps.to(device)).detach()
+
+        return losses, chosen_rewards, rejected_rewards
+
+    def _compute_loss_liger(
+        self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]]
+    ) -> dict[str, torch.Tensor]:
+        unwrapped_model = self.accelerator.unwrap_model(model)
+        concatenated_batch = self.concatenated_inputs(batch, padding_value=self.padding_value)
+
+        model_kwargs = {}
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        # Add the pixel values and attention masks for vision models
+        if "pixel_values" in concatenated_batch:
+            model_kwargs["pixel_values"] = concatenated_batch["pixel_values"]
+        if "pixel_attention_mask" in concatenated_batch:
+            model_kwargs["pixel_attention_mask"] = concatenated_batch["pixel_attention_mask"]
+        if "image_sizes" in concatenated_batch:
+            model_kwargs["image_sizes"] = concatenated_batch["image_sizes"]
+
+        prompt_attention_mask = concatenated_batch["prompt_attention_mask"]
+        completion_attention_mask = concatenated_batch["completion_attention_mask"]
+
+        if self.is_encoder_decoder:
+            # 1. Get encoder outputs
+            encoder_outputs = unwrapped_model.get_encoder()(
+                concatenated_batch["prompt_input_ids"],
+                attention_mask=concatenated_batch["prompt_attention_mask"],
+                return_dict=True,
+            )
+            # 2. Prepare decoder inputs
+            decoder_input_ids = shift_tokens_right(
+                concatenated_batch["completion_input_ids"],
+                unwrapped_model.config.decoder_start_token_id,
+            )
+            # 3.
Get decoder outputs + decoder_outputs = unwrapped_model.get_decoder()( + input_ids=decoder_input_ids, + attention_mask=concatenated_batch["completion_attention_mask"], + encoder_hidden_states=encoder_outputs.last_hidden_state, + encoder_attention_mask=concatenated_batch["prompt_attention_mask"], + use_cache=False, + ) + hidden_states = decoder_outputs.last_hidden_state + + ref_hidden_states = None + if not self.reference_free and self.ref_model is not None: + unwrapped_ref_model = self.accelerator.unwrap_model(self.ref_model) + ref_encoder_outputs = unwrapped_ref_model.get_encoder()( + concatenated_batch["prompt_input_ids"], + attention_mask=concatenated_batch["prompt_attention_mask"], + return_dict=True, + ) + ref_decoder_outputs = unwrapped_ref_model.get_decoder()( + input_ids=decoder_input_ids, + attention_mask=concatenated_batch["completion_attention_mask"], + encoder_hidden_states=ref_encoder_outputs.last_hidden_state, + encoder_attention_mask=concatenated_batch["prompt_attention_mask"], + use_cache=False, + ) + ref_hidden_states = ref_decoder_outputs.last_hidden_state + elif not self.reference_free: + with self.null_ref_context(): + ref_encoder_outputs = unwrapped_model.get_encoder()( + concatenated_batch["prompt_input_ids"], + attention_mask=concatenated_batch["prompt_attention_mask"], + return_dict=True, + ) + ref_decoder_outputs = unwrapped_model.get_decoder()( + input_ids=decoder_input_ids, + attention_mask=concatenated_batch["completion_attention_mask"], + encoder_hidden_states=ref_encoder_outputs.last_hidden_state, + encoder_attention_mask=concatenated_batch["prompt_attention_mask"], + use_cache=False, + ) + ref_hidden_states = ref_decoder_outputs.last_hidden_state + + labels = concatenated_batch["completion_input_ids"] + loss_mask = completion_attention_mask.bool() + else: + # For decoder-only models + input_ids = torch.cat( + (concatenated_batch["prompt_input_ids"], concatenated_batch["completion_input_ids"]), dim=1 + ) + attention_mask = torch.cat( + (concatenated_batch["prompt_attention_mask"], concatenated_batch["completion_attention_mask"]), + dim=1, + ) + # Mask the prompt but not the completion for the loss + loss_mask = torch.cat( + (torch.zeros_like(prompt_attention_mask), completion_attention_mask), + dim=1, + ) + + # Flush and truncate + if self.max_length is not None and self.max_length < attention_mask.size(1): + if self.truncation_mode == "keep_start": + # Flush left to reduce the memory usage + # [[0, 0, x, x, x, x], -> [[x, x, x, x], + # [0, x, x, x, 0, 0]] [x, x, x, 0]] + attention_mask, input_ids, loss_mask = flush_left(attention_mask, input_ids, loss_mask) + attention_mask = attention_mask[:, : self.max_length] + input_ids = input_ids[:, : self.max_length] + loss_mask = loss_mask[:, : self.max_length] + elif self.truncation_mode == "keep_end": + # Flush right before truncating left, then flush left + # [[0, 0, x, x, x, x], -> [[0, 0, x, x], + # [0, x, x, x, 0, 0]] [0, x, x, x]] + attention_mask, input_ids, loss_mask = flush_right(attention_mask, input_ids, loss_mask) + input_ids = input_ids[:, -self.max_length :] + attention_mask = attention_mask[:, -self.max_length :] + loss_mask = loss_mask[:, -self.max_length :] + attention_mask, input_ids, loss_mask = flush_left(attention_mask, input_ids, loss_mask) + else: + raise ValueError( + f"Unknown truncation mode: '{self.truncation_mode}'. Should be one of ['keep_end', " + "'keep_start']." 
+                    )
+            else:
+                # Flush left to reduce the memory usage
+                # [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
+                #  [0, x, x, x, 0, 0]]       [x, x, x, 0]]
+                attention_mask, input_ids, loss_mask = flush_left(attention_mask, input_ids, loss_mask)
+
+            # Add logits_to_keep optimization
+            if self.use_logits_to_keep:
+                first_compute_index = loss_mask.nonzero(as_tuple=True)[1].min()
+                logits_to_keep = (loss_mask.shape[1] - first_compute_index).item() + 1
+                model_kwargs["logits_to_keep"] = logits_to_keep
+
+            model_kwargs["output_hidden_states"] = True
+
+            # Add padding-free training support
+            if self.padding_free:
+                input_ids = input_ids[attention_mask.bool()].unsqueeze(0)
+                loss_mask = loss_mask[attention_mask.bool()].unsqueeze(0)
+                position_ids = attention_mask.cumsum(1)[attention_mask.bool()].unsqueeze(0) - 1
+                model_kwargs["position_ids"] = position_ids
+            else:
+                model_kwargs["attention_mask"] = attention_mask
+
+            # Get the base model outputs (before LM head)
+            if hasattr(unwrapped_model, "get_decoder") and unwrapped_model.get_decoder() is not None:
+                base_model = unwrapped_model.get_decoder()
+            else:
+                base_attr = getattr(unwrapped_model, "base_model_prefix", self.args.base_model_attribute_name)
+                base_model = getattr(unwrapped_model, base_attr, unwrapped_model)
+
+            outputs = base_model(
+                input_ids,
+                use_cache=False,
+                **model_kwargs,
+            )
+            hidden_states = outputs.last_hidden_state[:, :-1]
+
+            # Get reference hidden states if needed
+            ref_hidden_states = None
+            if not self.reference_free and self.ref_model is not None:
+                unwrapped_ref_model = self.accelerator.unwrap_model(self.ref_model)
+                if hasattr(unwrapped_ref_model, "get_decoder") and unwrapped_ref_model.get_decoder() is not None:
+                    ref_base_model = unwrapped_ref_model.get_decoder()
+                else:
+                    ref_attr = getattr(unwrapped_ref_model, "base_model_prefix", self.args.base_model_attribute_name)
+                    ref_base_model = getattr(unwrapped_ref_model, ref_attr, unwrapped_ref_model)
+
+                ref_outputs = ref_base_model(
+                    input_ids,
+                    use_cache=False,
+                    **model_kwargs,
+                )
+                ref_hidden_states = ref_outputs.last_hidden_state[:, :-1]
+            elif not self.reference_free:
+                if hasattr(unwrapped_model, "get_decoder") and unwrapped_model.get_decoder() is not None:
+                    ref_base_model = unwrapped_model.get_decoder()
+                else:
+                    ref_attr = getattr(unwrapped_model, "base_model_prefix", self.args.base_model_attribute_name)
+                    ref_base_model = getattr(unwrapped_model, ref_attr, unwrapped_model)
+                with self.null_ref_context():
+                    ref_outputs = ref_base_model(
+                        input_ids,
+                        use_cache=False,
+                        **model_kwargs,
+                    )
+                    ref_hidden_states = ref_outputs.last_hidden_state[:, :-1]
+
+            masked_input_ids = torch.where(loss_mask != 0, input_ids, self.label_pad_token_id)
+            labels = masked_input_ids[:, 1:]  # Shift labels one step left for causal-LM next-token prediction
+
+        # Get the LM head
+        lm_head = unwrapped_model.get_output_embeddings()
+
+        # Get reference model weights if needed
+        ref_weight = None
+        ref_bias = None
+        if not self.reference_free:
+            if self.ref_model is not None:
+                unwrapped_ref_model = self.accelerator.unwrap_model(self.ref_model)
+                ref_lm_head = unwrapped_ref_model.get_output_embeddings()
+            else:
+                with self.null_ref_context():
+                    ref_lm_head = unwrapped_model.get_output_embeddings()
+            ref_weight = ref_lm_head.weight
+            ref_bias = ref_lm_head.bias if hasattr(ref_lm_head, "bias") else None
+
+        # Compute loss using Liger kernel
+        loss_output = self.dpo_loss_fn(
+            lm_head.weight,
+            hidden_states,
+            labels,
+            bias=lm_head.bias if hasattr(lm_head, "bias") else None,
+            ref_input=ref_hidden_states if not self.reference_free else None,
ref_weight=ref_weight if not self.reference_free else None, + ref_bias=ref_bias if not self.reference_free else None, + ) + ( + loss, + (chosen_logps, rejected_logps, chosen_logits_mean, rejected_logits_mean, nll_loss, *aux_outputs), + ) = loss_output + + output = { + "loss": loss, + "chosen_logps": chosen_logps, + "rejected_logps": rejected_logps, + "mean_chosen_logits": chosen_logits_mean, + "mean_rejected_logits": rejected_logits_mean, + "nll_loss": nll_loss, + "chosen_rewards": aux_outputs[0], + "rejected_rewards": aux_outputs[1], + } + if self.aux_loss_enabled: + output["aux_loss"] = outputs.aux_loss + + return output + + def concatenated_forward( + self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]], is_ref_model: bool = False + ) -> dict[str, torch.Tensor]: + """ + Runs the given model on the given batch of inputs, concatenating the chosen and rejected inputs together. + + We do this to avoid doing two forward passes, because it's faster for FSDP. + + Args: + model: + Model to run the forward pass on. + batch: + Batch of input data. + is_ref_model: + Whether this method is being called for the reference model. If `True`, length desensitization is not + applied. + """ + num_examples = batch["prompt_input_ids"].shape[0] + + concatenated_batch = self.concatenated_inputs(batch, padding_value=self.padding_value) + + model_kwargs = {"use_cache": False} + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + # Add the pixel values and attention masks for vision models + if "pixel_values" in concatenated_batch: + model_kwargs["pixel_values"] = concatenated_batch["pixel_values"] + if "pixel_attention_mask" in concatenated_batch: + model_kwargs["pixel_attention_mask"] = concatenated_batch["pixel_attention_mask"] + if "image_sizes" in concatenated_batch: + model_kwargs["image_sizes"] = concatenated_batch["image_sizes"] + + prompt_input_ids = concatenated_batch["prompt_input_ids"] + prompt_attention_mask = concatenated_batch["prompt_attention_mask"] + completion_input_ids = concatenated_batch["completion_input_ids"] + completion_attention_mask = concatenated_batch["completion_attention_mask"] + if self.is_encoder_decoder: + labels = completion_input_ids + labels[completion_attention_mask == 0] = self.label_pad_token_id + outputs = model( + input_ids=prompt_input_ids, + attention_mask=prompt_attention_mask, + labels=labels, # we need the labels for the logits to be returned + **model_kwargs, + ) + logits = outputs.logits + loss_mask = completion_attention_mask.bool() + else: + # Concatenate the prompt and completion inputs + input_ids = torch.cat((prompt_input_ids, completion_input_ids), dim=1) + attention_mask = torch.cat((prompt_attention_mask, completion_attention_mask), dim=1) + # Mask the prompt but not the completion for the loss + loss_mask = torch.cat( + (torch.zeros_like(prompt_attention_mask), completion_attention_mask), + dim=1, + ) + + # Flush and truncate + if self.max_length is not None and self.max_length < attention_mask.size(1): + if self.truncation_mode == "keep_start": + # Flush left to reduce the memory usage + # [[0, 0, x, x, x, x], -> [[x, x, x, x], + # [0, x, x, x, 0, 0]] [x, x, x, 0]] + attention_mask, input_ids, loss_mask = flush_left(attention_mask, input_ids, loss_mask) + attention_mask = attention_mask[:, : self.max_length] + input_ids = input_ids[:, : self.max_length] + loss_mask = loss_mask[:, : self.max_length] + elif self.truncation_mode == "keep_end": + # Flush right before truncating left, then flush left + # [[0, 0, x, 
x, x, x], -> [[0, 0, x, x], + # [0, x, x, x, 0, 0]] [0, x, x, x]] + attention_mask, input_ids, loss_mask = flush_right(attention_mask, input_ids, loss_mask) + input_ids = input_ids[:, -self.max_length :] + attention_mask = attention_mask[:, -self.max_length :] + loss_mask = loss_mask[:, -self.max_length :] + attention_mask, input_ids, loss_mask = flush_left(attention_mask, input_ids, loss_mask) + else: + raise ValueError( + f"Unknown truncation mode: '{self.truncation_mode}'. Should be one of ['keep_end', " + "'keep_start']." + ) + else: + # Flush left to reduce the memory usage + # [[0, 0, x, x, x, x], -> [[x, x, x, x], + # [0, x, x, x, 0, 0]] [x, x, x, 0]] + attention_mask, input_ids, loss_mask = flush_left(attention_mask, input_ids, loss_mask) + + if self.use_logits_to_keep: + # Compute logits_to_keep based on loss_mask pattern: + # [[0, 0, 0, x, x, x, x], + # [0, 0, 0, x, x, x, 0]] + # ^ start computing logits from here ([:, -(7-3+1):]) + first_compute_index = loss_mask.nonzero(as_tuple=True)[1].min() + logits_to_keep = (loss_mask.shape[1] - first_compute_index).item() + 1 # +1 for the first label + model_kwargs["logits_to_keep"] = logits_to_keep + + model_kwargs["output_hidden_states"] = True + + if self.padding_free: + # Flatten the input_ids, position_ids, and loss_mask + # input_ids = [[a, b, c, 0], -> input_ids = [[a, b, c, d, e, f, g]] + # [d, e, f, g]] position_ids = [[0, 1, 2, 0, 1, 2, 3]] + input_ids = input_ids[attention_mask.bool()].unsqueeze(0) + loss_mask = loss_mask[attention_mask.bool()].unsqueeze(0) + position_ids = attention_mask.cumsum(1)[attention_mask.bool()].unsqueeze(0) - 1 + model_kwargs["position_ids"] = position_ids + else: + model_kwargs["attention_mask"] = attention_mask + + outputs = model(input_ids, **model_kwargs) + logits = outputs.logits + + # Offset the logits by one to align with the labels + labels = torch.roll(input_ids, shifts=-1, dims=1) + loss_mask = torch.roll(loss_mask, shifts=-1, dims=1).bool() + + if self.use_logits_to_keep: + # Align labels with logits + # logits: -, -, [x2, x3, x4, x5, x6] + # ^ --------- ^ after logits[:, :-1, :] + # labels: [y0, y1, y2, y3, y4, y5, y6] + # ^ --------- ^ with logits_to_keep=4, [:, -4:] + # loss_mask: [0, 0, 0, 1, 1, 1, 1] + labels = labels[:, -logits_to_keep:] + loss_mask = loss_mask[:, -logits_to_keep:] + + if logits.shape[:2] != labels.shape[:2]: + # for LLaVA, the returned logits include the image tokens (placed before the text tokens) + seq_len = labels.shape[1] + logits = logits[:, -seq_len:] + + # Compute the log probabilities of the labels + labels[~loss_mask] = 0 # dummy token; we'll ignore the losses on these tokens later + per_token_logps = selective_log_softmax(logits, labels) + per_token_logps[~loss_mask] = 0 + per_token_logps = torch.roll(per_token_logps, shifts=1, dims=1) + + if self.padding_free: + # Unflatten the per_token_logps (shape: [1, sum_seq_len] -> [batch_size, seq_len]) + batch_size, seq_len = attention_mask.shape + per_token_logps_ = torch.zeros( + batch_size, seq_len, device=outputs.logits.device, dtype=outputs.logits.dtype + ) + per_token_logps_[attention_mask.bool()] = per_token_logps + per_token_logps = per_token_logps_ + + all_logps = per_token_logps[:, 1:].sum(-1) + + output = {} + + if self.use_weighting: + with torch.no_grad(): + # Eq (2) of the WPO paper: https://huggingface.co/papers/2406.11827 + logprobs = F.log_softmax(logits, dim=-1) + weights_adjustment_factor = torch.logsumexp(2 * logprobs, dim=-1) # same as sum(probs**2) in log space + per_token_logps_adjusted = 
per_token_logps - weights_adjustment_factor + all_weights = (per_token_logps_adjusted * loss_mask).sum(-1) / loss_mask.sum(-1) + chosen_weights = all_weights[:num_examples] + rejected_weights = all_weights[num_examples:] + output["policy_weights"] = torch.clamp(torch.exp(chosen_weights + rejected_weights), max=1) + + if self.args.rpo_alpha is not None or "sft" in self.loss_type: + # Only use the chosen logits for the RPO loss or SFT loss + chosen_logits = logits[:num_examples, :-1] if not self.is_encoder_decoder else logits[:num_examples] + chosen_labels = labels[:num_examples, :-1] if not self.is_encoder_decoder else labels[:num_examples] + + # Compute the log probabilities of the labels + output["nll_loss"] = F.cross_entropy( + torch.flatten(chosen_logits, end_dim=1), torch.flatten(chosen_labels, end_dim=1), ignore_index=0 + ) + + if "ipo" in self.loss_type: + all_logps = all_logps / loss_mask.sum(-1) + + if self.args.ld_alpha is not None and not is_ref_model: + # Compute response lengths based on loss_mask + completion_lengths = loss_mask.sum(dim=1) + + chosen_lengths = completion_lengths[:num_examples] + rejected_lengths = completion_lengths[num_examples:] + public_lengths = torch.min(chosen_lengths, rejected_lengths) # l_p in the paper + public_lengths = torch.cat([public_lengths, public_lengths], dim=0) + + seq_len = per_token_logps.size(1) + position_ids = torch.arange(seq_len, device=per_token_logps.device).expand_as(per_token_logps) + + ld_mask = position_ids < public_lengths.unsqueeze(1) + mask = position_ids < completion_lengths.unsqueeze(1) + + front_mask = (ld_mask & mask).float() + rear_mask = (~ld_mask & mask).float() + front_logps = (per_token_logps * front_mask).sum(dim=1) + rear_logps = (per_token_logps * rear_mask).sum(dim=1) + + all_logps = front_logps + self.args.ld_alpha * rear_logps + + output["chosen_logps"] = all_logps[:num_examples] + output["rejected_logps"] = all_logps[num_examples:] + + # Compute the mean logits + if self.padding_free: + # position_ids contains a sequence of range identifiers (e.g., [[0, 1, 2, 0, 1, 2, 3, ...]]). + # There are 2*num_examples ranges in total: the first half corresponds to the chosen tokens, + # and the second half to the rejected tokens. + # To find the start of the rejected tokens, we look for the num_examples+1-th zero in pos_id. 
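+            # For example, with num_examples=2 and position_ids = [[0, 1, 2, 0, 1, 0, 1, 2, 3, 0, 1]], the zeros
+            # sit at flat indices {0, 3, 5, 9}; taking the zero at list index num_examples (= 2) gives split_idx=5,
+            # the first token of the first rejected sequence.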
+ split_idx = (position_ids == 0).nonzero(as_tuple=True)[1][num_examples] + mean_chosen_logits = logits[0, :split_idx][loss_mask[0, :split_idx]].mean() + mean_rejected_logits = logits[0, split_idx:][loss_mask[0, split_idx:]].mean() + else: + mean_chosen_logits = logits[:num_examples][loss_mask[:num_examples]].mean() + mean_rejected_logits = logits[num_examples:][loss_mask[num_examples:]].mean() + + output["mean_chosen_logits"] = mean_chosen_logits + output["mean_rejected_logits"] = mean_rejected_logits + + if self.aux_loss_enabled: + output["aux_loss"] = outputs.aux_loss + + return output + + def get_batch_loss_metrics( + self, + model: Union[PreTrainedModel, nn.Module], + batch: dict[str, Union[list, torch.LongTensor]], + train_eval: Literal["train", "eval"] = "train", + ) -> tuple[torch.Tensor, dict[str, float]]: + """Compute the DPO loss and other metrics for the given batch of inputs for train or test.""" + metrics = {} + + if self.args.use_liger_loss: + model_output = self._compute_loss_liger(model, batch) + losses = model_output["loss"] + chosen_rewards = model_output["chosen_rewards"] + rejected_rewards = model_output["rejected_rewards"] + else: + model_output = self.concatenated_forward(model, batch) + + # if ref_chosen_logps and ref_rejected_logps in batch use them, otherwise use the reference model + if "ref_chosen_logps" in batch and "ref_rejected_logps" in batch: + ref_chosen_logps = batch["ref_chosen_logps"] + ref_rejected_logps = batch["ref_rejected_logps"] + else: + ref_chosen_logps, ref_rejected_logps = self.compute_ref_log_probs(batch) + + # Initialize combined losses + losses = 0 + chosen_rewards = 0 + rejected_rewards = 0 + + # Compute losses for each loss type + for idx, loss_type in enumerate(self.loss_type): + # Compute individual loss using standard DPO loss function + _losses, _chosen_rewards, _rejected_rewards = self.dpo_loss( + model_output["chosen_logps"], + model_output["rejected_logps"], + ref_chosen_logps, + ref_rejected_logps, + loss_type, + model_output, + ) + + # Add weighted contributions + weight = self.loss_weights[idx] if self.loss_weights else 1.0 + losses = losses + _losses * weight + chosen_rewards = chosen_rewards + _chosen_rewards * weight + rejected_rewards = rejected_rewards + _rejected_rewards * weight + + reward_accuracies = (chosen_rewards > rejected_rewards).float() + + if self.args.rpo_alpha is not None: + losses = losses + self.args.rpo_alpha * model_output["nll_loss"] # RPO loss from V3 of the paper + + if self.use_weighting: + losses = losses * model_output["policy_weights"] + + if self.aux_loss_enabled: + losses = losses + self.aux_loss_coef * model_output["aux_loss"] + + prefix = "eval_" if train_eval == "eval" else "" + metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(chosen_rewards).mean().item() + metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(rejected_rewards).mean().item() + metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics(reward_accuracies).mean().item() + metrics[f"{prefix}rewards/margins"] = ( + self.accelerator.gather_for_metrics(chosen_rewards - rejected_rewards).mean().item() + ) + metrics[f"{prefix}logps/chosen"] = ( + self.accelerator.gather_for_metrics(model_output["chosen_logps"]).detach().mean().item() + ) + metrics[f"{prefix}logps/rejected"] = ( + self.accelerator.gather_for_metrics(model_output["rejected_logps"]).detach().mean().item() + ) + metrics[f"{prefix}logits/chosen"] = ( + 
+            self.accelerator.gather_for_metrics(model_output["mean_chosen_logits"]).detach().mean().item()
+        )
+        metrics[f"{prefix}logits/rejected"] = (
+            self.accelerator.gather_for_metrics(model_output["mean_rejected_logits"]).detach().mean().item()
+        )
+        if self.args.rpo_alpha is not None or "sft" in self.loss_type:
+            metrics[f"{prefix}nll_loss"] = (
+                self.accelerator.gather_for_metrics(model_output["nll_loss"]).detach().mean().item()
+            )
+        if self.aux_loss_enabled:
+            metrics[f"{prefix}aux_loss"] = (
+                self.accelerator.gather_for_metrics(model_output["aux_loss"]).detach().mean().item()
+            )
+
+        return losses.mean(), metrics
+
+    def compute_loss(
+        self,
+        model: Union[PreTrainedModel, nn.Module],
+        inputs: dict[str, Union[torch.Tensor, Any]],
+        return_outputs=False,
+        num_items_in_batch=None,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, float]]]:
+        compute_loss_context_manager = (
+            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
+        )
+        with compute_loss_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
+
+        # Make sure the loss is moved back to the device where the parent `Trainer` class accumulates the loss:
+        loss = loss.to(self.args.device)
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="train")
+
+        if return_outputs:
+            return loss, metrics
+
+        return loss
+
+    def generate_from_model_and_ref(self, model, batch: dict[str, torch.LongTensor]) -> tuple[str, str]:
+        """Generate samples from the model and reference model for the given batch of inputs."""
+
+        # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
+        # the torch amp context manager, as some hidden states are silently cast to full precision.
+        generate_context_manager = (
+            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
+        )
+
+        with generate_context_manager:
+            policy_output = model.generate(
+                input_ids=batch["prompt_input_ids"],
+                attention_mask=batch["prompt_attention_mask"],
+                max_length=self.max_length,
+                do_sample=True,
+                pad_token_id=self.padding_value,
+            )
+
+            # If "ref_output" is already in the batch, use it; otherwise generate from the reference model
+            if "ref_output" in batch:
+                ref_output = batch["ref_output"]
+            else:
+                if self.ref_model is None:
+                    with self.null_ref_context():
+                        ref_output = self.model.generate(
+                            input_ids=batch["prompt_input_ids"],
+                            attention_mask=batch["prompt_attention_mask"],
+                            max_length=self.max_length,
+                            do_sample=True,
+                            pad_token_id=self.padding_value,
+                        )
+                else:
+                    ref_output = self.ref_model.generate(
+                        input_ids=batch["prompt_input_ids"],
+                        attention_mask=batch["prompt_attention_mask"],
+                        max_length=self.max_length,
+                        do_sample=True,
+                        pad_token_id=self.padding_value,
+                    )
+
+        policy_output = pad_to_length(policy_output, self.max_length, self.padding_value)
+        policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True)
+
+        ref_output = pad_to_length(ref_output, self.max_length, self.padding_value)
+        ref_output_decoded = self.processing_class.batch_decode(ref_output, skip_special_tokens=True)
+
+        return policy_output_decoded, ref_output_decoded
+
+    def prediction_step(
+        self,
+        model: Union[PreTrainedModel, nn.Module],
+        inputs: dict[str, Union[torch.Tensor, Any]],
+        prediction_loss_only: bool,
+        ignore_keys: Optional[list[str]] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+        if ignore_keys is None:
+            if hasattr(model, "config"):
+                ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
+            else:
+                ignore_keys = []
+
+        prediction_context_manager = (
+            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
+        )
+
+        with torch.no_grad(), prediction_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="eval")
+
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="eval")
+
+        if prediction_loss_only:
+            return loss.detach(), None, None
+
+        # logits for the chosen and rejected samples from the model
+        logits_dict = {
+            "eval_logits/chosen": metrics["eval_logits/chosen"],
+            "eval_logits/rejected": metrics["eval_logits/rejected"],
+        }
+        logits = [v for k, v in logits_dict.items() if k not in ignore_keys]
+        logits = torch.tensor(logits, device=self.accelerator.device)
+        labels = torch.zeros(logits.shape[0], device=self.accelerator.device)
+
+        return (loss.detach(), logits, labels)
+
+    def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
+        for key, value in metrics.items():
+            self._stored_metrics[train_eval][key].append(value)
+
+    def evaluation_loop(
+        self,
+        dataloader: DataLoader,
+        description: str,
+        prediction_loss_only: Optional[bool] = None,
+        ignore_keys: Optional[list[str]] = None,
+        metric_key_prefix: str = "eval",
+    ) -> EvalLoopOutput:
+        """
+        Overriding the built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared
+        by `Trainer.evaluate()` and `Trainer.predict()`.
+
+        Works both with and without labels.
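+
+        When `generate_during_eval=True`, one randomly sampled eval batch is additionally decoded from both the
+        policy and the reference model and logged as a "game_log" table to whichever trackers are configured
+        (wandb, comet_ml, or mlflow).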
+ """ + + # Sample and save to game log if requested (for one batch to save time) + if self.generate_during_eval: + # Generate random indices within the range of the total number of samples + num_samples = len(dataloader.dataset) + random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size) + + # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader + random_batch_dataset = dataloader.dataset.select(random_indices) + random_batch = self.data_collator(random_batch_dataset) + random_batch = self._prepare_inputs(random_batch) + + policy_output_decoded, ref_output_decoded = self.generate_from_model_and_ref(self.model, random_batch) + + table = pd.DataFrame( + columns=["Prompt", "Policy", "Ref Model"], + data=[ + [prompt, pol[len(prompt) :], ref[len(prompt) :]] + for prompt, pol, ref in zip( + random_batch_dataset["prompt"], policy_output_decoded, ref_output_decoded + ) + ], + ) + if "wandb" in self.args.report_to and self.accelerator.is_main_process: + wandb.log({"game_log": wandb.Table(data=table)}) + + if "comet_ml" in self.args.report_to: + log_table_to_comet_experiment( + name="game_log.csv", + table=table, + ) + + if "mlflow" in self.args.report_to and self.accelerator.is_main_process: + mlflow.log_table(data=table, artifact_file="game_log.json") + + # Base evaluation + initial_output = super().evaluation_loop( + dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix + ) + + return initial_output + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + """ + Log `logs` on the various objects watching training, including stored metrics. + + Args: + logs (`dict[str, float]`): + The values to log. + start_time (`float` or `None`, *optional*, defaults to `None`): + Start time of the training. + """ + # logs either has 'loss' or 'eval_loss' + train_eval = "train" if "loss" in logs else "eval" + # Add averaged stored metrics to logs + for key, metrics in self._stored_metrics[train_eval].items(): + logs[key] = torch.tensor(metrics).mean().item() + del self._stored_metrics[train_eval] + return super().log(logs, start_time) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + # docstyle-ignore + citation = textwrap.dedent( + """\ + @inproceedings{rafailov2023direct, + title = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}}, + author = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn}, + year = 2023, + booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023}, + url = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html}, + editor = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine}, + }""" + ) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="DPO", + trainer_citation=citation, + paper_title="Direct Preference Optimization: Your Language Model is Secretly a Reward Model", + paper_id="2305.18290", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothDPOTrainer(_UnslothDPOTrainer): + """ + +Trainer for Direct Preference Optimization (DPO) method. + +This class is a wrapper around the [`transformers.Trainer`] class and inherits all of its attributes and methods. + +Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in + `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. + ref_model (`PreTrainedModelWrapper`): + Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation + and loss. If no reference model is provided, the trainer will create a reference model with the same + architecture as the model to be optimized. + args ([`DPOConfig`], *optional*, defaults to `None`): + Configuration for this trainer. If `None`, a default configuration is used. + data_collator (`DataCollator`, *optional*): + Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`. + Will default to [`DataCollatorForPreference`]. + train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]): + Dataset to use for training. DPO supports [preference](#preference) type and. 
+
+            - [Standard](dataset_formats#standard): Each sample contains plain text.
+            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+              and content).
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
+            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
+            Processing class used to process the data. If `None`, the processing class is loaded from the model's
+            name with [`~transformers.AutoTokenizer.from_pretrained`].
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function that will be used to compute metrics at evaluation. Must take an [`EvalPrediction`] and
+            return a dictionary mapping metric names to metric values. *Note*: when passing `TrainingArgs` with
+            `batch_eval_metrics` set to `True`, your `compute_metrics` function must take a boolean `compute_result`
+            argument. This will be triggered after the last eval batch to signal that the function needs to calculate
+            and return the global summary statistics rather than accumulating the batch-level statistics.
+        callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
+            List of callbacks to customize the training loop. Will add those to the list of default callbacks
+            detailed [here](https://huggingface.co/docs/transformers/main_classes/callback).
+
+            If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
+            method.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
+            A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on
+            your model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
+        optimizer_cls_and_kwargs (`Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*, defaults to `None`):
+            A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args`
+            in `args`. Incompatible with the `optimizers` argument.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*, defaults to `None`):
+            A function that preprocesses the logits right before caching them at each evaluation step. Must take two
+            tensors, the logits and the labels, and return the logits once processed as desired. The modifications
+            made by this function will be reflected in the predictions received by `compute_metrics`.
+
+            Note that the labels (second parameter) will be `None` if the dataset does not have them.
+        peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
+            PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
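+
+    Example (a minimal usage sketch; the dataset name, `model`, and `tokenizer` are illustrative placeholders rather
+    than part of this class's contract):
+
+    ```python
+    >>> from datasets import load_dataset
+
+    >>> train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
+    >>> trainer = UnslothDPOTrainer(
+    ...     model=model,  # e.g. a model prepared by Unsloth's FastLanguageModel
+    ...     args=UnslothDPOConfig(output_dir="out"),
+    ...     train_dataset=train_dataset,
+    ...     processing_class=tokenizer,
+    ... )
+    >>> trainer.train()
+    ```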
+ + """ + def __init__( + self, + model, + ref_model = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + compute_metrics = None, + callbacks = None, + optimizer_cls_and_kwargs = None, + preprocess_logits_for_metrics = None, + peft_config = None, + **kwargs + ): + if args is None: args = UnslothDPOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 
'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('dpo_trainer', other_metrics) + if hasattr(train_dataset, 'column_names'): + column_names = set(train_dataset.column_names) + check = ['chosen', 'rejected', 'prompt', 'chosen_input_ids', 'chosen_attention_mask', + 'chosen_labels', 'rejected_input_ids', 'rejected_attention_mask', 'rejected_labels', + 'prompt_input_ids', 'prompt_attention_mask'] + if all(x in column_names for x in check): + train_dataset = train_dataset.remove_columns(['chosen', 'rejected', 'prompt']) + del check, column_names + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not 
work? [TODO]
+        if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1:
+            if getattr(args, "_n_gpu", 1) != 1:
+                args._n_gpu = 1
+        if "model" in locals() and hasattr(model, "for_training"):
+            model.for_training()
+        super().__init__(
+            model = model,
+            ref_model = ref_model,
+            args = args,
+            data_collator = data_collator,
+            train_dataset = train_dataset,
+            eval_dataset = eval_dataset,
+            processing_class = processing_class,
+            compute_metrics = compute_metrics,
+            callbacks = callbacks,
+            optimizer_cls_and_kwargs = optimizer_cls_and_kwargs,
+            preprocess_logits_for_metrics = preprocess_logits_for_metrics,
+            peft_config = peft_config,**kwargs)
+        if "model" in locals() and hasattr(model, "for_inference"):
+            model.for_inference()
+        if hasattr(self, 'neftune_hook_handle'):
+            self.neftune_hook_handle.remove()
+        if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle
+        if getattr(args, 'neftune_noise_alpha', None) is not None:
+            model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha
+        pass
+        if hasattr(self, 'accelerator'):
+            scaler = self.accelerator.scaler
+            current_model = model
+            while hasattr(current_model, 'model'):
+                current_model.accelerator_scaler = scaler
+                current_model = current_model.model
+            current_model.accelerator_scaler = scaler
+        pass
+        if hasattr(self, 'train'):
+            self.train = MethodType(prepare_for_training_mode(self.__class__.train), self)
+        pass
+
+pass
+
+
+if hasattr(logger, "addFilter"):
+    import logging
+    class HideLoggingMessage(logging.Filter):
+        def __init__(self, text): self.text = text
+        def filter(self, x): return not (self.text in x.getMessage())
+    pass
+    logger.addFilter(HideLoggingMessage("`use_cache=True`"))
+
diff --git a/unsloth_compiled_cache/UnslothGKDTrainer.py b/unsloth_compiled_cache/UnslothGKDTrainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..70a6a78061a2b11073b086a5c965d3dba78fb272
--- /dev/null
+++ b/unsloth_compiled_cache/UnslothGKDTrainer.py
@@ -0,0 +1,1220 @@
+"""
+2025.11.2
+2025.11.1
+4.57.2
+0.23.0
+__UNSLOTH_VERSIONING__
+"""
+
+# Unsloth auto generated code
+# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+ +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.gkd_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, Callable, DataCollator, DataCollatorForChatML, Dataset, EvalPrediction, F, FeatureExtractionMixin, GKDConfig, GKDTrainer, GenerationConfig, Optional, PeftConfig, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SFTTrainer, TrainerCallback, Union, disable_dropout_in_model, empty_cache, generate_model_card, get_comet_experiment_url, is_wandb_available, nn, os, prepare_deepspeed, random, textwrap, torch, unwrap_model_for_generation) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) + logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values + all_per_token_logps.append(per_token_logps) + pass + all_per_token_logps = torch.concat(all_per_token_logps) + all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) + return all_per_token_logps + +def calculate_pad_tokens_in_prompt( + input_ids: torch.Tensor, + logits_to_keep: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given prompt tensor, it returns all the left padded tokens in that sequence. 
For example, [pad, pad, pad, cat] counts as 3 left-pad tokens.
+    """
+    if logits_to_keep >= input_ids.shape[1]:
+        raise ValueError("logits_to_keep must be smaller than the sequence length.")
+
+    prompt_section = input_ids[:, :-logits_to_keep]
+
+    padding_mask = (prompt_section == pad_token_id)
+
+    pad_token_counts = padding_mask.sum(dim=1)
+
+    return pad_token_counts
+
+def create_completion_attention_mask(
+    completion_input_ids: torch.Tensor,
+    left_pad_tokens_per_prompt: torch.Tensor,
+    max_left_pad: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a sequence [p, p, p, c, c, c, pad, pad, pad], where p are extra prompt tokens left over
+    from slicing the tensor, c are completion tokens, and pad are padding tokens, this function
+    builds a completion mask that zeroes out the p and pad tokens: in this example,
+    [0, 0, 0, 1, 1, 1, 0, 0, 0].
+    """
+    batch_size, completion_len = completion_input_ids.shape
+    device = completion_input_ids.device
+
+    num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
+
+    indices = torch.arange(completion_len, device=device).unsqueeze(0)
+    shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
+
+    non_padding_mask = (completion_input_ids != pad_token_id)
+
+    final_mask = shift_mask & non_padding_mask
+
+    return final_mask
+
+def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
+    """
+    Moves all padding tokens in each sequence of a batch to the right (i.e., left-packs the real tokens).
+    """
+    mask = (tensor != pad_id)
+    # Must use stable=True: the boolean mask has many ties, and a stable sort preserves token order
+    sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
+    packed_tensor = torch.gather(tensor, 1, sorted_indices)
+    return packed_tensor
+
+def align_logprobs_with_mask(
+    logprob_tensor: torch.Tensor,
+    attention_mask: torch.Tensor,
+    pad_value: float = 0.0
+) -> torch.Tensor:
+    """
+    Aligns a log probability tensor with a given attention mask.
+    """
+
+    device = logprob_tensor.device
+    batch_size, logprob_seq_len = logprob_tensor.shape
+    mask_seq_len = attention_mask.shape[1]
+
+    padded_logprobs = torch.full(
+        attention_mask.shape,
+        fill_value=pad_value,
+        dtype=logprob_tensor.dtype,
+        device=device
+    )
+
+    left_pad_counts = torch.argmax(attention_mask, dim=1)
+
+    cols = torch.arange(logprob_seq_len, device=device)
+    dest_indices = left_pad_counts.unsqueeze(1) + cols
+
+    # Create destination row indices
+    # Shape: [batch_size, logprob_seq_len]
+    row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
+
+    # Filter out-of-bounds indices and perform the assignment.
+    # Create a mask to identify only the indices that are within the bounds
+    # of the target tensor's sequence length.
+    valid_mask = dest_indices < mask_seq_len
+
+    # Use this mask to select only the valid row indices, column indices,
+    # and the corresponding values from the logprob tensor.
+    # This flattens the selected elements into 1D tensors.
+    valid_rows = row_indices[valid_mask]
+    valid_cols = dest_indices[valid_mask]
+    valid_vals = logprob_tensor[valid_mask]
+
+    # Place the valid values into their correct positions in the padded tensor
+    # using a single, efficient advanced indexing operation.
+    padded_logprobs[valid_rows, valid_cols] = valid_vals
+
+    return padded_logprobs
+@dataclass
+class UnslothGKDConfig(GKDConfig):
+    """
+
+Configuration class for [`GKDTrainer`].
+
+This class includes only the parameters that are specific to GKD training. For a full list of training arguments,
+please refer to the [`~transformers.TrainingArguments`] and [`SFTConfig`] documentation.
+ +Args: + temperature (`float`, *optional*, defaults to `0.9`): + Temperature for sampling. The higher the temperature, the more random the completions. + lmbda (`float`, *optional*, defaults to `0.5`): + Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy + student-generated outputs). + beta (`float`, *optional*, defaults to `0.5`): + Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence loss. When + beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL Divergence. + max_new_tokens (`int`, *optional*, defaults to `128`): + Maximum number of tokens to generate per completion. + teacher_model_name_or_path (`str` or `None`, *optional*, defaults to `None`): + Model name or path of the teacher model. If `None`, the teacher model will be the same as the model being + trained. + teacher_model_init_kwargs (`dict[str, Any]]` or `None`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model + from a string. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + seq_kd (`bool`, *optional*, defaults to `False`): + Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised FT on + teacher-generated output). + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + max_seq_length : Optional[int] = field( + default = None, + metadata = {'help': 'Maximum sequence length to truncate to.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + 
label_names = None,
+        load_best_model_at_end = False,
+        metric_for_best_model = None,
+        greater_is_better = None,
+        ignore_data_skip = False,
+        fsdp = None,
+        fsdp_min_num_params = 0,
+        fsdp_config = None,
+        fsdp_transformer_layer_cls_to_wrap = None,
+        accelerator_config = None,
+        parallelism_config = None,
+        deepspeed = None,
+        label_smoothing_factor = 0.0,
+        optim = 'adamw_8bit',
+        optim_args = None,
+        adafactor = False,
+        group_by_length = False,
+        length_column_name = 'length',
+        report_to = None,
+        project = 'huggingface',
+        trackio_space_id = 'trackio',
+        ddp_find_unused_parameters = None,
+        ddp_bucket_cap_mb = None,
+        ddp_broadcast_buffers = None,
+        dataloader_pin_memory = True,
+        dataloader_persistent_workers = False,
+        skip_memory_metrics = True,
+        use_legacy_prediction_loop = False,
+        push_to_hub = False,
+        resume_from_checkpoint = None,
+        hub_model_id = None,
+        hub_strategy = 'every_save',
+        hub_token = None,
+        hub_private_repo = None,
+        hub_always_push = False,
+        hub_revision = None,
+        gradient_checkpointing = True,
+        gradient_checkpointing_kwargs = None,
+        include_inputs_for_metrics = False,
+        eval_do_concat_batches = True,
+        fp16_backend = 'auto',
+        push_to_hub_model_id = None,
+        push_to_hub_organization = None,
+        push_to_hub_token = None,
+        mp_parameters = '',
+        auto_find_batch_size = False,
+        full_determinism = False,
+        torchdynamo = None,
+        ray_scope = 'last',
+        ddp_timeout = 1800,
+        torch_compile = False,
+        torch_compile_backend = None,
+        torch_compile_mode = None,
+        include_tokens_per_second = False,
+        include_num_input_tokens_seen = False,
+        neftune_noise_alpha = None,
+        optim_target_modules = None,
+        batch_eval_metrics = False,
+        eval_on_start = False,
+        use_liger_kernel = False,
+        liger_kernel_config = None,
+        eval_use_gather_object = False,
+        average_tokens_across_devices = True,
+        model_init_kwargs = None,
+        chat_template_path = None,
+        dataset_text_field = 'text',
+        dataset_kwargs = None,
+        dataset_num_proc = None,
+        eos_token = None,
+        pad_token = None,
+        max_length = 1024,
+        packing = False,
+        packing_strategy = 'bfd',
+        padding_free = False,
+        pad_to_multiple_of = None,
+        eval_packing = None,
+        completion_only_loss = None,
+        assistant_only_loss = False,
+        loss_type = 'nll',
+        activation_offloading = False,
+        temperature = 0.9,
+        lmbda = 0.5,
+        beta = 0.5,
+        max_new_tokens = 128,
+        teacher_model_name_or_path = None,
+        teacher_model_init_kwargs = None,
+        disable_dropout = True,
+        seq_kd = False,
+        vllm_sampling_params = None,
+        unsloth_num_chunks = -1,
+        max_seq_length = None,
+        **kwargs,
+    ):
+        if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!')
+        if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too large (> 1)! Consider decreasing it to 1e-1, otherwise gradient updates will explode!')
+        if output_dir is None and save_strategy == 'steps' and save_steps == 500:
+            output_dir = 'unsloth_training_checkpoints'
+            save_strategy = 'no'
+        if dataset_num_proc is None:
+            from multiprocessing import cpu_count
+            dataset_num_proc = min(max(cpu_count()+4, 2), 64)
+        if os.environ.get('UNSLOTH_ENABLE_FLEX_ATTENTION', '0') == '1':
+            from unsloth_zoo.flex_attention import HAS_FLEX_ATTENTION
+            if HAS_FLEX_ATTENTION and pad_to_multiple_of is None:
+                from unsloth_zoo.flex_attention import FLEX_ATTENTION_BLOCK_SIZE
+                pad_to_multiple_of = FLEX_ATTENTION_BLOCK_SIZE
+
+        # ValueError is used here; the original generated code raised an undefined `MathError`.
+        if temperature <= 0:
+            raise ValueError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.')
+        elif temperature >= 10:
+            raise ValueError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.')
+
+
+        super().__init__(
+            output_dir = output_dir,
+            overwrite_output_dir = overwrite_output_dir,
+            do_train = do_train,
+            do_eval = do_eval,
+            do_predict = do_predict,
+            eval_strategy = eval_strategy,
+            prediction_loss_only = prediction_loss_only,
+            per_device_train_batch_size = per_device_train_batch_size,
+            per_device_eval_batch_size = per_device_eval_batch_size,
+            per_gpu_train_batch_size = per_gpu_train_batch_size,
+            per_gpu_eval_batch_size = per_gpu_eval_batch_size,
+            gradient_accumulation_steps = gradient_accumulation_steps,
+            eval_accumulation_steps = eval_accumulation_steps,
+            eval_delay = eval_delay,
+            torch_empty_cache_steps = torch_empty_cache_steps,
+            learning_rate = learning_rate,
+            weight_decay = weight_decay,
+            adam_beta1 = adam_beta1,
+            adam_beta2 = adam_beta2,
+            adam_epsilon = adam_epsilon,
+            max_grad_norm = max_grad_norm,
+            num_train_epochs = num_train_epochs,
+            max_steps = max_steps,
+            lr_scheduler_type = lr_scheduler_type,
+            warmup_ratio = warmup_ratio,
+            warmup_steps = warmup_steps,
+            log_level = log_level,
+            log_level_replica = log_level_replica,
+            log_on_each_node = log_on_each_node,
+            logging_dir = logging_dir,
+            logging_strategy = logging_strategy,
+            logging_first_step = logging_first_step,
+            logging_steps = logging_steps,
+            logging_nan_inf_filter = logging_nan_inf_filter,
+            save_strategy = save_strategy,
+            save_steps = save_steps,
+            save_total_limit = save_total_limit,
+            save_safetensors = save_safetensors,
+            save_on_each_node = save_on_each_node,
+            save_only_model = save_only_model,
+            restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint,
+            no_cuda = no_cuda,
+            use_cpu = use_cpu,
+            use_mps_device = use_mps_device,
+            seed = seed,
+            data_seed = data_seed,
+            jit_mode_eval = jit_mode_eval,
+            bf16 = bf16,
+            fp16 = fp16,
+            fp16_opt_level = fp16_opt_level,
+            half_precision_backend = half_precision_backend,
+            bf16_full_eval = bf16_full_eval,
+            fp16_full_eval = fp16_full_eval,
+            tf32 = tf32,
+            local_rank = local_rank,
+            ddp_backend = ddp_backend,
+            tpu_num_cores = tpu_num_cores,
+            tpu_metrics_debug = tpu_metrics_debug,
+            debug = debug,
+            dataloader_drop_last = dataloader_drop_last,
+            eval_steps = eval_steps,
+            dataloader_num_workers = dataloader_num_workers,
+            dataloader_prefetch_factor = dataloader_prefetch_factor,
+            past_index = past_index,
+            run_name = run_name,
+            disable_tqdm = disable_tqdm,
+            remove_unused_columns = remove_unused_columns,
+            label_names = label_names,
+            load_best_model_at_end = load_best_model_at_end,
+            metric_for_best_model = metric_for_best_model,
+            greater_is_better = greater_is_better,
+            ignore_data_skip =
ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + model_init_kwargs = model_init_kwargs, + chat_template_path = chat_template_path, + dataset_text_field = dataset_text_field, + dataset_kwargs = dataset_kwargs, + dataset_num_proc = dataset_num_proc, + eos_token = eos_token, + pad_token = pad_token, + max_length = max_length, + packing = packing, + packing_strategy = packing_strategy, + padding_free = padding_free, + pad_to_multiple_of = pad_to_multiple_of, + eval_packing = eval_packing, + completion_only_loss = completion_only_loss, + assistant_only_loss = assistant_only_loss, + loss_type = loss_type, + activation_offloading = activation_offloading, + temperature = temperature, + lmbda = lmbda, + beta = beta, + max_new_tokens = max_new_tokens, + teacher_model_name_or_path = teacher_model_name_or_path, + teacher_model_init_kwargs = teacher_model_init_kwargs, + disable_dropout = disable_dropout, + seq_kd = seq_kd,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks + self.max_seq_length = max_seq_length +pass + +class _UnslothGKDTrainer(SFTTrainer): + """Trainer for Generalized Knowledge 
Distillation (GKD) of language models.
+
+    For details on GKD, see the paper: [On-Policy Distillation of Language Models: Learning from Self-Generated
+    Mistakes](https://huggingface.co/papers/2306.13649).
+
+    Args:
+        model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
+            Model to be trained, or the string identifier of the model to be instantiated from a pretrained model.
+        teacher_model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
+            Teacher model for knowledge distillation, or the string identifier of the model to be instantiated from a
+            pretrained model.
+        args ([`GKDConfig`], *optional*):
+            Training arguments.
+        data_collator ([`~transformers.DataCollator`], *optional*):
+            Data collator to batch samples from the dataset. It defaults to a [`DataCollatorForChatML`] using the
+            `processing_class`.
+        train_dataset ([`~datasets.Dataset`], *optional*):
+            Dataset for training.
+        eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
+            Dataset for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Class to process the data.
+        compute_metrics (`Callable`, *optional*):
+            Function to compute metrics at evaluation. Must take in an [`~transformers.EvalPrediction`] and return a
+            dictionary mapping metric names to float values.
+        callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
+            Callbacks to use during training.
+        optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
+            Tuple containing the optimizer and the learning rate scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable`, *optional*):
+            Function to preprocess the logits before computing the metrics. Must take in the `logits` and `labels` and
+            return the logits to be used for metrics computation.
+        peft_config ([`~peft.config.PeftConfig`], *optional*):
+            PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the `model` will be
+            wrapped with the specified PEFT adapter.
+        formatting_func (`Callable`, *optional*):
+            Function to format the dataset. Must take in an example and return an example.
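+
+    Example (minimal illustrative sketch; the model and dataset ids below are placeholders, not
+    something this file prescribes):
+
+    ```python
+    from datasets import load_dataset
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+    train_dataset = load_dataset("trl-lib/chatbot_arena_completions", split="train")
+
+    trainer = UnslothGKDTrainer(
+        model = "Qwen/Qwen2-0.5B-Instruct",          # student (placeholder id)
+        teacher_model = "Qwen/Qwen2-1.5B-Instruct",  # teacher (placeholder id)
+        args = UnslothGKDConfig(output_dir = "gkd-out", per_device_train_batch_size = 1),
+        processing_class = tokenizer,
+        train_dataset = train_dataset,
+    )
+    trainer.train()
+    ```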
+ """ + + _tag_names = ["trl", "gkd"] + + def __init__( + self, + model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, + teacher_model: Union[PreTrainedModel, nn.Module, str] = None, + args: Optional[GKDConfig] = None, + data_collator: Optional[DataCollator] = None, # type: ignore + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional["PeftConfig"] = None, + formatting_func: Optional[Callable] = None, + ): + # Ensure Trainer does not drop non-signature columns used by the collator [e.g., "prompts"] + args.remove_unused_columns = False + # Respect a user-provided data_collator; otherwise, provide a ChatML collator that + if data_collator is None: + data_collator = DataCollatorForChatML(tokenizer=processing_class, max_length=args.max_length) + + # Ensure SFTTrainer does not pre-process the dataset when using a ChatML collator, + # so that raw conversational fields [e.g., "messages"] remain available to the collator. + if args.dataset_kwargs is None: + args.dataset_kwargs = {"skip_prepare_dataset": True} + else: + args.dataset_kwargs["skip_prepare_dataset"] = True + + # Liger fused GKD loss [JSD] + self.use_liger_gkd_loss = False + if args.use_liger_kernel: + self.liger_jsd_loss = LigerFusedLinearJSDLoss( + beta=args.beta, + ignore_index=-100, + temperature=args.temperature, + compiled=False, + ) + self.use_liger_gkd_loss = True + + super().__init__( + model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + peft_config=peft_config, + formatting_func=formatting_func, + ) + + if args.teacher_model_init_kwargs is None: + teacher_model_init_kwargs = {} + elif not isinstance(teacher_model, str): + raise ValueError( + "You passed teacher_model_init_kwargs to the GKDConfig, but your teacher_model is already instantiated." 
+ ) + else: + teacher_model_init_kwargs = args.teacher_model_init_kwargs + teacher_model_init_kwargs["dtype"] = ( + teacher_model_init_kwargs["dtype"] + if teacher_model_init_kwargs["dtype"] in ["auto", None] + else getattr(torch, teacher_model_init_kwargs["dtype"]) + ) + + if isinstance(teacher_model, str): + teacher_model = AutoModelForCausalLM.from_pretrained(teacher_model, **teacher_model_init_kwargs) + + # Disable dropout in the model + if args.disable_dropout: + disable_dropout_in_model(self.model) + + if self.is_deepspeed_enabled: + self.teacher_model = prepare_deepspeed(teacher_model, self.accelerator) + else: + self.teacher_model = self.accelerator.prepare_model(teacher_model, evaluation_mode=True) + + self.lmbda = args.lmbda + self.beta = args.beta + self.temperature = args.temperature + self.seq_kd = args.seq_kd + + self.generation_config = GenerationConfig( + max_new_tokens=args.max_new_tokens, + temperature=args.temperature, + do_sample=True, + top_k=0, + use_cache=False if args.gradient_checkpointing else True, + pad_token_id=self.processing_class.pad_token_id, + ) + # Set custom EOS tokens if they are specified by the model's generation + # config. This is important for models with the Llama 3 chat template, + # which use special tokens <|eot_id|> and <|eom_id|> to mark the end of + # turns or messages. + if ( + hasattr(self.model.generation_config, "eos_token_id") + and self.model.generation_config.eos_token_id is not None + ): + self.generation_config.eos_token_id = self.model.generation_config.eos_token_id + + @staticmethod + def generalized_jsd_loss( + student_logits, teacher_logits, labels=None, beta=0.5, temperature=1.0, reduction="batchmean" + ): + """ + Compute the generalized Jensen-Shannon Divergence loss for knowledge distillation using F.kl_div. See Eq. (1) + of https://huggingface.co/papers/2306.13649 for the definition. + + Args: + student_logits: + Tensor of shape (batch_size, sequence_length, vocab_size) + teacher_logits: + Tensor of shape (batch_size, sequence_length, vocab_size) + labels: + Tensor of shape (batch_size, sequence_length) with -100 for padding tokens to ignore when computing + loss + beta: + Interpolation coefficient between 0 and 1 (default: 0.5) + temperature: + Softmax temperature (default: 1.0) + reduction: + Specifies the reduction to apply to the output (default: 'batchmean') + + Returns: + loss: Scalar tensor with the generalized JSD loss + """ + + # Apply temperature scaling + student_logits = student_logits / temperature + teacher_logits = teacher_logits / temperature + + # Compute log probabilities for student and probabilities for teacher + student_log_probs = F.log_softmax(student_logits, dim=-1) + teacher_log_probs = F.log_softmax(teacher_logits, dim=-1) + + if beta == 0: + jsd = F.kl_div(student_log_probs, teacher_log_probs, reduction="none", log_target=True) + elif beta == 1: + jsd = F.kl_div(teacher_log_probs, student_log_probs, reduction="none", log_target=True) + else: + # Compute the log of the mixture distribution + # log(a + b) = log(exp(log(a)) + exp(log(b))) -> for mixture + beta = torch.tensor(beta, dtype=student_log_probs.dtype) + mixture_log_probs = torch.logsumexp( + torch.stack([student_log_probs + torch.log(1 - beta), teacher_log_probs + torch.log(beta)]), + dim=0, + ) + + # Compute KL divergences using F.kl_div + # PyTorch differs from the standard mathematical definition, so the order of the probability distributions is swapped compared to that defined in the paper. 
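+            # Concretely, F.kl_div(input=log_q, target=log_p, log_target=True) computes
+            # exp(log_p) * (log_p - log_q) = KL(p || q) elementwise, so passing the mixture
+            # as `input` below yields KL(teacher || mixture) and KL(student || mixture).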
+ kl_teacher = F.kl_div(mixture_log_probs, teacher_log_probs, reduction="none", log_target=True) + kl_student = F.kl_div(mixture_log_probs, student_log_probs, reduction="none", log_target=True) + + # Compute the Generalized Jensen-Shannon Divergence + jsd = beta * kl_teacher + (1 - beta) * kl_student + + # Masking + if labels is not None: + mask = labels != -100 + jsd = jsd[mask] + + # Apply reduction + if reduction == "batchmean": + return jsd.sum() / mask.sum() if labels is not None else jsd.sum() / (jsd.size(0) * jsd.size(1)) + elif reduction == "sum": + return jsd.sum() + elif reduction == "mean": + return jsd.mean() + else: + return jsd + + def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): + if self.use_liger_gkd_loss: + # Forward only through the base models (avoid lm_head to save memory) + unwrapped_student = self.accelerator.unwrap_model(model) + if hasattr(unwrapped_student, "get_decoder") and unwrapped_student.get_decoder() is not None: + base_student = unwrapped_student.get_decoder() + else: + base_student = getattr( + unwrapped_student, getattr(unwrapped_student, "base_model_prefix", "model"), unwrapped_student + ) + + student_outputs = base_student( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + output_hidden_states=True, + use_cache=False, + ) + + self.teacher_model.eval() + unwrapped_teacher = self.accelerator.unwrap_model(self.teacher_model) + if hasattr(unwrapped_teacher, "get_decoder") and unwrapped_teacher.get_decoder() is not None: + base_teacher = unwrapped_teacher.get_decoder() + else: + base_teacher = getattr( + unwrapped_teacher, getattr(unwrapped_teacher, "base_model_prefix", "model"), unwrapped_teacher + ) + with torch.no_grad(): + teacher_outputs = base_teacher( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + output_hidden_states=True, + use_cache=False, + ) + + # hidden states (shifted) + student_hidden = student_outputs.last_hidden_state[:, :-1].contiguous() + teacher_hidden = teacher_outputs.last_hidden_state[:, :-1].contiguous() + + # labels mask and labels (shifted) + labels_mask = inputs["labels"] != -100 + masked_input_ids = torch.where( + labels_mask, inputs["input_ids"], torch.full_like(inputs["input_ids"], -100) + ) + true_labels = masked_input_ids[:, 1:].contiguous() + + # heads + student_head = unwrapped_student.get_output_embeddings() + teacher_head = unwrapped_teacher.get_output_embeddings() + + # liger fused jsd loss + loss = self.liger_jsd_loss( + student_input=student_hidden, + student_weight=student_head.weight, + teacher_input=teacher_hidden, + teacher_weight=teacher_head.weight, + true_labels=true_labels, + student_bias=getattr(student_head, "bias", None), + teacher_bias=getattr(teacher_head, "bias", None), + ) + else: + # compute student output + student_outputs = model( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + ) + + # compute teacher output in eval mode + self.teacher_model.eval() + with torch.no_grad(): + teacher_outputs = self.teacher_model( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + ) + + # slice the logits for the generated tokens using the inputs["prompts"] lengths + prompt_lengths = inputs["prompts"].shape[1] + shifted_student_logits = student_outputs.logits[:, prompt_lengths - 1 : -1, :] + shifted_teacher_logits = teacher_outputs.logits[:, prompt_lengths - 1 : -1, :] + shifted_labels = inputs["labels"][:, prompt_lengths:] + + # compute loss + loss = 
self.generalized_jsd_loss( + student_logits=shifted_student_logits, + teacher_logits=shifted_teacher_logits, + labels=shifted_labels, + beta=self.beta, + ) + + # empty cache + empty_cache() + + # Return loss + return (loss, student_outputs) if return_outputs else loss + + @staticmethod + def generate_on_policy_outputs(model, inputs, generation_config, pad_token_id=None): + # Generate output with respect to the prompt-only + generated_outputs = model.generate( + input_ids=inputs["prompts"], + attention_mask=inputs.get("prompt_attention_mask", None), + generation_config=generation_config, + return_dict_in_generate=True, + ) + + # Get the generated token IDs + generated_tokens = generated_outputs.sequences + # Calculate new attention mask + new_attention_mask = torch.ones_like(generated_tokens) + new_labels = generated_tokens.clone() + + # If there's pad_token_id, set attention mask to 0 for padding tokens + if pad_token_id is not None: + new_labels[new_labels == pad_token_id] = -100 + new_attention_mask[generated_tokens == pad_token_id] = 0 + + return generated_tokens, new_attention_mask, new_labels + + def training_step( + self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None + ) -> torch.Tensor: + """ + Perform a training step for the Generalized Knowledge Distillation (GKD) model. + + This method implements the on-policy learning approach described in the GKD paper. With probability + `self.lmbda`, it generates new responses using the student model, which are then used for training instead of + the original inputs. + """ + if self.seq_kd: + with unwrap_model_for_generation(self.teacher_model, self.accelerator) as unwrapped_model: + new_input_ids, new_attention_mask, new_labels = self.generate_on_policy_outputs( + unwrapped_model, inputs, self.generation_config, self.processing_class.pad_token_id + ) + inputs["input_ids"] = new_input_ids + inputs["attention_mask"] = new_attention_mask + inputs["labels"] = new_labels + if random.random() <= self.lmbda: + with unwrap_model_for_generation(model, self.accelerator) as unwrapped_model: + new_input_ids, new_attention_mask, new_labels = self.generate_on_policy_outputs( + unwrapped_model, inputs, self.generation_config, self.processing_class.pad_token_id + ) + inputs["input_ids"] = new_input_ids + inputs["attention_mask"] = new_attention_mask + inputs["labels"] = new_labels + + loss = super().training_step(model, inputs, num_items_in_batch) + return loss + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
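+
+        Example (illustrative call): `trainer.create_model_card(model_name="my-gkd-model", tags="gkd")`
+        writes the generated card to `README.md` inside `args.output_dir`.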
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + # docstyle-ignore + citation = textwrap.dedent("""\ + @inproceedings{agarwal2024on-policy, + title = {{On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes}}, + author = {Rishabh Agarwal and Nino Vieillard and Yongchao Zhou and Piotr Stanczyk and Sabela Ramos Garea and Matthieu Geist and Olivier Bachem}, + year = 2024, + booktitle = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024}, + publisher = {OpenReview.net}, + url = {https://openreview.net/forum?id=3zKtaqxLhW}, + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="GKD", + trainer_citation=citation, + paper_title="On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes", + paper_id="2306.13649", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothGKDTrainer(_UnslothGKDTrainer): + """ + Trainer for Generalized Knowledge Distillation (GKD) of language models. + +For details on GKD, see the paper: [On-Policy Distillation of Language Models: Learning from Self-Generated +Mistakes](https://huggingface.co/papers/2306.13649). + +Args: + model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*): + Model to be trained, or the string identifier of the model to be instantiated from a pretrained model. + teacher_model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*): + Teacher model for knowledge distillation, or the string identifier of the model to be instantiated from a + pretrained model. + args ([`GKDConfig`], *optional*): + Training arguments. + data_collator ([`~transformers.DataCollator`], *optional*): + Data collator to batch samples from the dataset. It defaults to a [`DataCollatorForChatML`] using the + `processing_class`. + train_dataset ([`~datasets.Dataset`], *optional*): + Dataset for training. + eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*): + Dataset for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*): + Class to process the data. + compute_metrics (`Callable`, *optional*): + Function to compute metrics at evaluation. Must take in an [`~transformers.EvalPrediction`] and return a + dictionary string to float. + callbacks (`list` of [`~transformers.TrainerCallback`], *optional*): + Callbacks to use during training. + optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`): + Tuple containing the optimizer and the learning rate scheduler to use for training. 
+ preprocess_logits_for_metrics (`Callable`, *optional*): + Function to preprocess the logits before computing the metrics. Must take in the `logits` and `labels` and + return the logits to be used for metrics computation. + peft_config ([`~peft.config.PeftConfig`], *optional*): + PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the `model` will be + wrapped with the specified PEFT adapter. + formatting_func (`Callable`, *optional*): + Function to format the dataset. Must take in an example and return an example. + + """ + def __init__( + self, + model = None, + teacher_model = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + compute_metrics = None, + callbacks = None, + preprocess_logits_for_metrics = None, + peft_config = None, + formatting_func = None, + **kwargs + ): + if args is None: args = UnslothGKDConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if 
isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('gkd_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? [TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + model = model, + teacher_model = teacher_model, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + compute_metrics = compute_metrics, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config, + formatting_func = formatting_func,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass diff --git a/unsloth_compiled_cache/UnslothGRPOTrainer.py b/unsloth_compiled_cache/UnslothGRPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..4d5dece359447f8b2b4e4253df228bef8199a9e4 --- /dev/null +++ b/unsloth_compiled_cache/UnslothGRPOTrainer.py @@ -0,0 +1,3597 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . + +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.grpo_trainer import (Any, AutoConfig, AutoModelForSequenceClassification, AutoProcessor, AutoTokenizer, DataLoader, Dataset, FSDP, GRPOConfig, GRPOTrainer, GenerationConfig, IterableDataset, Optional, Path, PeftConfig, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RepeatSampler, RewardFunc, Sampler, SyncRefModelCallback, Trainer, TrainerCallback, Union, VLLMClient, _ForwardRedirection, apply_chat_template, broadcast_object_list, copy, datasets, defaultdict, deque, disable_dropout_in_model, entropy_from_logits, gather, gather_object, generate_model_card, get_comet_experiment_url, identity, inspect, is_conversational, is_datasets_available, is_flash_attn_2_available, is_liger_kernel_available, is_peft_model, is_rich_available, is_vllm_available, is_wandb_available, logger, logging, maybe_apply_chat_template, nanmax, nanmin, nanstd, nn, nullcontext, os, pad, partial, prepare_deepspeed, prepare_fsdp, prepare_multimodal_messages, prepare_peft_model, print_prompt_completions_sample, profiling_context, profiling_decorator, re, seed_worker, selective_log_softmax, set_seed, shuffle_sequence_dict, split_pixel_values_by_grid, split_tensor_dict, textwrap, torch, transformers, truncate_with_protected_tokens, unsplit_pixel_values_by_grid, unwrap_model_for_generation, Any, FSDP, Union, apply_chat_template, broadcast_object_list, copy, gather, gather_object, is_conversational, is_flash_attn_2_available, logging, maybe_apply_chat_template, nanmax, nanmin, nanstd, nullcontext, os, pad, prepare_multimodal_messages, profiling_context, re, torch, transformers, truncate_with_protected_tokens, unwrap_model_for_generation, entropy_from_logits, os, pad, re, selective_log_softmax, torch, transformers, re, Any, Union, profiling_decorator, re, shuffle_sequence_dict, split_pixel_values_by_grid, split_tensor_dict, torch, unsplit_pixel_values_by_grid, Optional, PreTrainedModel, Trainer, logger, os, re, torch, FSDP, nn, os, re, FSDP, nn, re, torch, GRPOTrainer, Trainer, gather, nanmax, nanmin, os, re, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, 
"for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) + logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values + all_per_token_logps.append(per_token_logps) + pass + all_per_token_logps = torch.concat(all_per_token_logps) + all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) + return all_per_token_logps + +def calculate_pad_tokens_in_prompt( + input_ids: torch.Tensor, + logits_to_keep: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens + """ + if logits_to_keep >= input_ids.shape[1]: + raise ValueError("logits_to_keep must be smaller than the sequence length.") + + prompt_section = input_ids[:, :-logits_to_keep] + + padding_mask = (prompt_section == pad_token_id) + + pad_token_counts = padding_mask.sum(dim=1) + + return pad_token_counts + +def create_completion_attention_mask( + completion_input_ids: torch.Tensor, + left_pad_tokens_per_prompt: torch.Tensor, + max_left_pad: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad] + + Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens + and pad are pad tokens, this function would make a completion mask that would 0 out the pad + and p tokens. so in this example [0,0,0,1,1,1,0,0,0] + """ + batch_size, completion_len = completion_input_ids.shape + device = completion_input_ids.device + + num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt + + indices = torch.arange(completion_len, device=device).unsqueeze(0) + shift_mask = indices >= num_tokens_to_mask.unsqueeze(1) + + non_padding_mask = (completion_input_ids != pad_token_id) + + final_mask = shift_mask & non_padding_mask + + return final_mask + +def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor: + """ + Moves all padding tokens in each sequence of a batch to the right. + """ + mask = (tensor != pad_id) + # Must do stable=True since binary mark is unordered + sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True) + packed_tensor = torch.gather(tensor, 1, sorted_indices) + return packed_tensor + +def align_logprobs_with_mask( + logprob_tensor: torch.Tensor, + attention_mask: torch.Tensor, + pad_value: float = 0.0 +) -> torch.Tensor: + """ + Aligns a log probability tensor with a given attention mask. 
+ """ + + device = logprob_tensor.device + batch_size, logprob_seq_len = logprob_tensor.shape + mask_seq_len = attention_mask.shape[1] + + padded_logprobs = torch.full( + attention_mask.shape, + fill_value=pad_value, + dtype=logprob_tensor.dtype, + device=device + ) + + left_pad_counts = torch.argmax(attention_mask, dim=1) + + cols = torch.arange(logprob_seq_len, device=device) + dest_indices = left_pad_counts.unsqueeze(1) + cols + + # Create destination row indices + # Shape: [batch_size, logprob_seq_len] + row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices) + + # --- 4. Filter out-of-bounds indices and perform assignment --- + # Create a mask to identify only the indices that are within the bounds + # of the target tensor's sequence length. + valid_mask = dest_indices < mask_seq_len + + # Use this mask to select only the valid row indices, column indices, + # and the corresponding values from the logprob tensor. + # This flattens the selected elements into 1D tensors. + valid_rows = row_indices[valid_mask] + valid_cols = dest_indices[valid_mask] + valid_vals = logprob_tensor[valid_mask] + + # Place the valid values into their correct positions in the padded tensor + # using a single, efficient advanced indexing operation. + padded_logprobs[valid_rows, valid_cols] = valid_vals + + return padded_logprobs +def grpo_compute_loss( + ref_logits, + new_logits, + old_logits, + sampling_per_token_logps, + input_ids, + mask, + beta, + advantages, + **kwargs +): + # All Unsloth Zoo code licensed under LGPLv3 + # Set defaults for optional arguments + loss_type = kwargs.get("loss_type", "grpo") + epsilon_low = kwargs.get("epsilon_low", 0.2) + epsilon_high = kwargs.get("epsilon_high", 0.2) + max_completion_length = kwargs.get("max_completion_length", 8192) + delta = kwargs.get("delta", None) + temperature = kwargs.get("temperature", 1.0) + logit_scale_multiply = kwargs.get("logit_scale_multiply", 0.0) + logit_scale_divide = kwargs.get("logit_scale_divide", 0.0) + logit_softcapping = kwargs.get("logit_softcapping", 0.0) + importance_sampling_level = kwargs.get("importance_sampling_level", "token") + num_items_in_batch = kwargs.get("num_items_in_batch", None) + current_gradient_accumulation_steps = kwargs.get("current_gradient_accumulation_steps", 1) + num_processes = kwargs.get("num_processes", 1) + use_vllm = kwargs.get("use_vllm", False) + vllm_importance_sampling_cap = kwargs.get("vllm_importance_sampling_cap", 2.0) + input_ids = input_ids.unsqueeze(-1) + + # Optional logit softcapping and logit dividing + if logit_scale_multiply != 0: new_logits = new_logits * logit_scale_multiply + if logit_scale_divide != 0: new_logits = new_logits / logit_scale_divide + if logit_softcapping != 0: new_logits = new_logits * torch.tanh(new_logits / logit_softcapping) + + new_logits = new_logits.to(torch.float32) + # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + if temperature != 1.0: new_logits = new_logits / temperature + new_x = torch.gather(new_logits, dim = -1, index = input_ids).squeeze(-1) + new = new_x - torch.logsumexp(new_logits, dim = -1) + # x_i - logsumexp(x_i) + with torch.no_grad(): + if beta != 0.0: + assert ref_logits is not None, "ref_logits should not be None when beta != 0.0" + + # Optional logit softcapping and logit dividing + if logit_scale_multiply != 0: ref_logits = ref_logits * logit_scale_multiply + if logit_scale_divide != 0: ref_logits = ref_logits / logit_scale_divide + if 
logit_softcapping != 0: ref_logits = ref_logits * torch.tanh(ref_logits / logit_softcapping) + + ref_logits = ref_logits.to(torch.float32) + # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + if temperature != 1.0: ref_logits = ref_logits / temperature + ref_x = torch.gather(ref_logits, dim = -1, index = input_ids).squeeze(-1) + ref = ref_x - torch.logsumexp(ref_logits, dim = -1) + pass + + if old_logits is not None: + # Optional logit softcapping and logit dividing + if logit_scale_multiply != 0: old_logits = old_logits * logit_scale_multiply + if logit_scale_divide != 0: old_logits = old_logits / logit_scale_divide + if logit_softcapping != 0: old_logits = old_logits * torch.tanh(old_logits / logit_softcapping) + + old_logits = old_logits.to(torch.float32) + # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + if temperature != 1.0: old_logits = old_logits / temperature + old_x = torch.gather(old_logits, dim = -1, index = input_ids).squeeze(-1) + old = old_x - torch.logsumexp(old_logits, dim = -1) + pass + if use_vllm and sampling_per_token_logps is not None: + #must filter out extra prompt tokens in begining after making input_ids left padded + importance_sampling_ratio = torch.exp((old * mask) - sampling_per_token_logps) + importance_sampling_ratio = torch.clamp( + importance_sampling_ratio, max=vllm_importance_sampling_cap + ) + pass + pass + + # Reverse KL + # Note that this is a low variance low bias estimator for the KL divergence as used in GRPO paper + if beta != 0.0: + kl_i = torch.exp(ref - new) - (ref - new) - 1.0 + + else: + # set kl_i to a tensor of zeros with the correct shape + if importance_sampling_level == "sequence": + kl_i = new.new_zeros(new.size(0), 1) + else: + kl_i = torch.zeros_like(new) + # Full correct reverse KL divergence?? Missing term maybe? + # kl_i = torch.exp(new) * kl_i + + # Below is forward KL (normal KL) + # kl_i = torch.exp(old) * (old - new) + if old_logits is not None: + log_ratio = new - old + else: + log_ratio = new - new.detach() + + if importance_sampling_level == "token": + log_importance_weights = log_ratio + elif importance_sampling_level == "sequence": + log_importance_weights = (log_ratio * mask).sum(-1) / mask.sum(-1).clamp(min=1.0) + log_importance_weights = log_importance_weights.unsqueeze(-1) + else: + raise ValueError( + f"Unknown importance sampling level: {importance_sampling_level}. Possible values are 'token' " + "and 'sequence'." + ) + + coef_1 = torch.exp(log_importance_weights) + + coef_2 = torch.clamp(coef_1, 1 - epsilon_low, 1 + epsilon_high) + + if delta is not None: + loss_1 = torch.clamp(coef_1, max=delta) * advantages.unsqueeze(1) + else: + loss_1 = coef_1 * advantages.unsqueeze(1) + pass + + # Must detach - otherwise gradients are not propagated correctly! 
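+    # The surrogate below is the standard PPO/GRPO clipped objective: with ratio
+    # r = exp(log_importance_weights) and advantage A, each token contributes
+    # min(r * A, clip(r, 1 - epsilon_low, 1 + epsilon_high) * A). Illustrative
+    # numbers: r = 1.5, A = 1, epsilon_high = 0.2 gives min(1.5, 1.2) = 1.2, so
+    # for positive advantages a ratio beyond the clip range adds no extra gradient.
+    # In the on-policy case (old_logits is None) the ratio reduces to
+    # exp(new - new.detach()), which is identically 1 in the forward pass: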
+ # exp(x - x) == 1 + # loss_i = torch.exp(new - new.detach()) * advantages.unsqueeze(1) + + loss_2 = coef_2 * advantages.unsqueeze(1) + loss_i = -torch.min(loss_1, loss_2) + + if use_vllm and sampling_per_token_logps is not None: + loss_i = loss_i * importance_sampling_ratio + #delta for metric + with torch.no_grad(): + delta = torch.abs(old - sampling_per_token_logps) + delta = delta * mask + flat_is_ratio = importance_sampling_ratio * mask + else: + delta = torch.tensor([]).detach() + flat_is_ratio = torch.tensor([]).detach() + if beta != 0.0: + loss_i = loss_i + beta * kl_i + + mask = mask.to(torch.float32) + n_mask_per_reward = mask.sum(1) + + # https://github.com/huggingface/trl/blob/e8b8499f1f8d76838155b515e414ee98f757d6d5/trl/trainer/grpo_trainer.py#L1624 + if loss_type == "grpo": + loss = ((loss_i * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() + loss = loss / current_gradient_accumulation_steps + elif loss_type == "bnpo": + loss = (loss_i * mask).sum() / mask.sum().clamp(min=1.0) + loss = loss / current_gradient_accumulation_steps + elif loss_type == "dr_grpo": + loss = (loss_i * mask).sum() / (loss_i.size(0) * max_completion_length) + loss = loss / current_gradient_accumulation_steps + elif loss_type == "dapo": + normalizer = num_items_in_batch/ num_processes + loss = (loss_i * mask).sum() / normalizer + else: + raise ValueError(f"Unknown loss type: {loss_type}") + + # loss = (loss_i * mask).sum() / mask.sum() + + # Get metrics as well which are folded + def masked_batch_mean(x): + with torch.inference_mode(): + completion_length = n_mask_per_reward.mean() + if x.shape[1] == 1: # when importance_sampling_level == "sequence" + return completion_length, x.mean() + else: + mean_kl_per_reward = (x * mask).sum(1) / n_mask_per_reward + mean_kl = mean_kl_per_reward.mean() + return completion_length, mean_kl + completion_length, mean_kl = masked_batch_mean(kl_i) + return loss, completion_length, mean_kl, delta, flat_is_ratio + +class UnslothEfficientGRPO(torch.autograd.Function): + # All Unsloth Zoo code licensed under LGPLv3 + @staticmethod + def forward(ctx, _new_hidden_states, _old_hidden_states, _ref_hidden_states, _sampling_per_token_logps, lm_head, _input_ids, _mask, _advantages, beta, scaler = None, n_chunks = 1, extra_kwargs=None): + if extra_kwargs is None: + extra_kwargs = {} + def compute_loss(new_hidden_states, old_hidden_states, ref_hidden_states, sampling_per_token_logps, input_ids, mask, advantages, scaling): + new_logits = torch.matmul(new_hidden_states.to(lm_head.dtype), lm_head.t()) + new_logits = new_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred + with torch.no_grad(): + if beta != 0.0: + ref_logits = torch.matmul(ref_hidden_states.to(lm_head.dtype), lm_head.t()) + ref_logits = ref_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred + else: + ref_logits = None + if old_hidden_states is not None: + old_logits = torch.matmul(old_hidden_states.to(lm_head.dtype), lm_head.t()) + old_logits = old_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred + else: + old_logits = None + # if old_hidden_states is not None: + # old_logits = torch.matmul(old_hidden_states, lm_head.t()) #last logit already excluded + # old_logits = old_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred + # else: + # old_logits = None + # unsloth_zoo/rl_replacements.py + loss, completion_length, mean_kl, delta, flat_is_ratio = grpo_compute_loss( + ref_logits, + new_logits, + 
old_logits, + sampling_per_token_logps, + input_ids, + mask, + beta, + advantages, + **extra_kwargs, + ) + + # Scale loss if needed for mixed precision training + scaled_loss = loss * scaling + # Must add .loss.detach otherwise autograd uses 2x VRAM + return scaled_loss, (loss.detach(), completion_length, mean_kl, delta, flat_is_ratio) + pass + + device =_new_hidden_states.device + grad_inputs = torch.empty_like(_new_hidden_states) + accumulated_loss = torch.zeros(1, device = device) + accumulated_completion_length = torch.zeros(1, device = device) + accumulated_mean_kl = torch.zeros(1, device = device) + accumulated_delta = [] + accumulated_flat_is_ratio = [] + def accumulate_chunk( + new_hidden_states_j, + old_hidden_states_j, + ref_hidden_states_j, + sampling_per_token_logps_j, + input_ids_j, + mask_j, + advantages_j, + scaling, + grad_inputs_j, + ): + (chunk_grad_input,), (chunk_loss, (unscaled_loss, chunk_completion_length, chunk_mean_kl, chunk_delta, chunk_flat_is_ratio)) = torch.func.grad_and_value( + compute_loss, + argnums = (0,), + has_aux = True, + )(new_hidden_states_j, old_hidden_states_j, ref_hidden_states_j, sampling_per_token_logps_j, input_ids_j, mask_j, advantages_j, scaling) + accumulated_loss .add_(unscaled_loss) + accumulated_completion_length.add_(chunk_completion_length) + accumulated_mean_kl .add_(chunk_mean_kl) + accumulated_delta .append(chunk_delta) + accumulated_flat_is_ratio .append(chunk_flat_is_ratio) + grad_inputs_j[:] = chunk_grad_input + pass + + accumulate_chunk = torch.compile( + accumulate_chunk, + fullgraph = True, + # [TODO] Dynamic marking causes torch.compile errors if sequence length is long + dynamic = True, + options = torch_compile_options, + ) + + grad_inputs_chunks = torch.chunk(grad_inputs, chunks = n_chunks, dim = 0) + new_hidden_states = torch.chunk(_new_hidden_states, chunks = n_chunks, dim = 0) + if _old_hidden_states is not None: + old_hidden_states = torch.chunk(_old_hidden_states, chunks = n_chunks, dim = 0) + else: + old_hidden_states = [None] * n_chunks + if _ref_hidden_states is not None: + ref_hidden_states = torch.chunk(_ref_hidden_states, chunks = n_chunks, dim = 0) + else: + ref_hidden_states = [None] * n_chunks + if _sampling_per_token_logps is not None: + sampling_per_token_logps = torch.chunk(_sampling_per_token_logps, chunks = n_chunks, dim = 0) + else: + sampling_per_token_logps = [None] * n_chunks + input_ids = torch.chunk(_input_ids, chunks = n_chunks, dim = 0) + mask = torch.chunk(_mask, chunks = n_chunks, dim = 0) + advantages = torch.chunk(_advantages, chunks = n_chunks, dim = 0) + + # Get mixed precision scaling if seen + scaling = scaler.get_scale() if scaler is not None else 1.0 + + # Force torch.compile to use dynamic shapes for seqlen dim + # mark_dynamic = lambda x: torch._dynamo.mark_dynamic(x, 1) + + for (grad_inputs_j, new_hidden_states_j, old_hidden_states_j, ref_hidden_states_j, sampling_per_token_logps_j, input_ids_j, mask_j, advantages_j, ) in \ + zip(grad_inputs_chunks, new_hidden_states, old_hidden_states, ref_hidden_states, sampling_per_token_logps, input_ids, mask, advantages): + + # [TODO] Dynamic marking causes torch.compile errors if sequence length is long + + # mark_dynamic(new_hidden_states_j) + # mark_dynamic(ref_hidden_states_j) + # if old_hidden_states_j is not None: + # mark_dynamic(old_hidden_states_j) + # mark_dynamic(input_ids_j) + # mark_dynamic(mask_j) + + accumulate_chunk( + new_hidden_states_j, + old_hidden_states_j, + ref_hidden_states_j, + sampling_per_token_logps_j, + input_ids_j, + 
mask_j, + advantages_j, + scaling, + grad_inputs_j, + ) + pass + + grad_inputs .div_(n_chunks) + accumulated_loss .div_(n_chunks) + accumulated_completion_length.div_(n_chunks) + accumulated_mean_kl .div_(n_chunks) + + if _sampling_per_token_logps is not None: + accumulated_delta = torch.cat(accumulated_delta, dim=0) + accumulated_flat_is_ratio = torch.cat(accumulated_flat_is_ratio, dim=0) + else: + accumulated_delta = None + accumulated_flat_is_ratio = None + ctx.save_for_backward(grad_inputs) + return ( + accumulated_loss, + accumulated_completion_length, + accumulated_mean_kl, + accumulated_delta, + accumulated_flat_is_ratio + ) + pass + + @staticmethod + def backward(ctx, grad_output, dcompletion_length, dmean_kl, ddelta, ddflat_is_ratio): + (grad_input,) = ctx.saved_tensors + return (grad_input, None, None, None, None, None, None, None, None, None, None, None) + pass + +def grpo_accumulated_loss( + trainer, + input_ids, + attention_mask, + logits_to_keep, + completion_mask, + advantages, + old_hidden_states, + ref_hidden_states, + n_chunks = -1, + **kwargs, +): + # All Unsloth Zoo code licensed under LGPLv3 + bsz, qlen = input_ids.shape + + pixel_values = kwargs.get('pixel_values',None) + image_grid_thw = kwargs.get('image_grid_thw',None) + pixel_attention_mask = kwargs.get('pixel_attention_mask',None) + image_sizes = kwargs.get('image_sizes',None) + sampling_per_token_logps = kwargs.get("sampling_per_token_logps", None) + #delete this from kwargs so less issues + del kwargs["sampling_per_token_logps"] + kwargs["vllm_importance_sampling_cap"] = trainer.vllm_importance_sampling_cap if sampling_per_token_logps is not None else None + kwargs["use_vllm"] = trainer.use_vllm + # Find closest multiple + factors = [i for i in range(1, bsz + 1) if bsz % i == 0] + if n_chunks == -1: n_chunks = bsz + n_chunks = factors[min(np.searchsorted(factors, n_chunks), len(factors)-1)] + + if not hasattr(trainer, '_autocast_dtype'): + trainer._autocast_dtype = torch.float16 if os.environ.get('ACCELERATE_MIXED_PRECISION', 'fp16') == 'fp16' else torch.bfloat16 + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': trainer._autocast_dtype = None + pass + os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "1" + + lm_head = trainer.model.get_output_embeddings().weight + + if pixel_values is None: + left_pad_tokens_per_prompt = calculate_pad_tokens_in_prompt(input_ids, logits_to_keep, trainer.processing_class.pad_token_id) + + max_left_pad = max(left_pad_tokens_per_prompt).item() + + input_ids = left_pack_padding(input_ids, trainer.processing_class.pad_token_id) + + completion_input_ids = input_ids[:, -(logits_to_keep +max_left_pad):] + + completion_mask = create_completion_attention_mask(completion_input_ids, left_pad_tokens_per_prompt, max_left_pad, trainer.processing_class.pad_token_id).to(attention_mask.dtype) + #TODO given the completion mask here we need to, handle the left pad tokens so the sizes of completion + #token or old logprobs are compatible with the importance sampling logprobs + if trainer.use_vllm and sampling_per_token_logps is not None: + sampling_per_token_logps = align_logprobs_with_mask(sampling_per_token_logps, completion_mask) + attention_mask = input_ids != trainer.processing_class.pad_token_id + attention_mask = attention_mask.to(attention_mask.dtype) + else: + completion_input_ids = input_ids[:, -logits_to_keep:] + + unwrapped_model = trainer.accelerator.unwrap_model(trainer.model, keep_fp32_wrapper = False) + + # Do not move hidden_states from device 1 to device 0: + for module in 
unwrapped_model.modules():
+ if hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "io_same_device"):
+ module._hf_hook.io_same_device = False
+ pass
+ # Get autocaster
+ if trainer._autocast_dtype is None:
+ autocaster = nullcontext()
+ else:
+ autocaster = torch.amp.autocast(device_type = trainer.model.device.type, dtype = trainer._autocast_dtype)
+ with autocaster:
+ if pixel_values is None:
+ new_hidden_states = unwrapped_model(
+ input_ids = input_ids,
+ attention_mask = attention_mask,
+ pixel_values = pixel_values,
+ image_grid_thw = image_grid_thw,
+ pixel_attention_mask = pixel_attention_mask,
+ image_sizes = image_sizes,
+ # logits_to_keep = logits_to_keep + 1,
+ ).logits
+
+ #keep extra logit as we generated a new token
+ new_hidden_states = new_hidden_states[:, -(logits_to_keep +max_left_pad+1): , :]
+ if ref_hidden_states is not None:
+ ref_hidden_states = ref_hidden_states[:, -(logits_to_keep +max_left_pad+1): , :]
+ if old_hidden_states is not None:
+ old_hidden_states = old_hidden_states[:, -(logits_to_keep +max_left_pad+1): , :]
+ else:
+ new_hidden_states = unwrapped_model(
+ input_ids = input_ids,
+ attention_mask = attention_mask,
+ pixel_values = pixel_values,
+ image_grid_thw = image_grid_thw,
+ pixel_attention_mask = pixel_attention_mask,
+ image_sizes = image_sizes,
+ logits_to_keep = logits_to_keep + 1,
+ ).logits
+ loss, completion_length, mean_kl, delta, flat_is_ratio = UnslothEfficientGRPO.apply(
+ new_hidden_states,
+ old_hidden_states,
+ ref_hidden_states,
+ sampling_per_token_logps,
+ lm_head,
+ completion_input_ids,
+ completion_mask,
+ advantages,
+ trainer.beta,
+ trainer.accelerator.scaler,
+ n_chunks,
+ kwargs # pass kwargs as a dict
+ )
+
+
+ # Must force not returning hidden states but logits otherwise gibberish
+ os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "0"
+
+ return loss, completion_length, mean_kl, delta, flat_is_ratio
+ # Old non efficient code path
+ new_logits = torch.matmul(new_hidden_states, lm_head.t())
+ new_logits = new_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred
+ old_logits = torch.matmul(old_hidden_states, lm_head.t())
+ old_logits = old_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred
+ loss, completion_length, mean_kl = grpo_compute_loss(
+ old_logits,
+ new_logits,
+ completion_input_ids,
+ completion_mask,
+ trainer.beta,
+ advantages,
+ )
+ return loss, completion_length, mean_kl
+ pass
+
+@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options)
+def grpo_compute_loss_slow(
+ ref_logits,
+ new_logits,
+ old_logits,
+ sampling_per_token_logps,
+ input_ids,
+ mask,
+ beta,
+ advantages,
+ **kwargs
+):
+ # All Unsloth Zoo code licensed under LGPLv3
+ # Set defaults for optional arguments
+ loss_type = kwargs.get("loss_type", "grpo")
+ epsilon_low = kwargs.get("epsilon_low", 0.2)
+ epsilon_high = kwargs.get("epsilon_high", 0.2)
+ max_completion_length = kwargs.get("max_completion_length", 8192)
+ delta = kwargs.get("delta", None)
+ temperature = kwargs.get("temperature", 1.0)
+ logit_scale_multiply = kwargs.get("logit_scale_multiply", 0.0)
+ logit_scale_divide = kwargs.get("logit_scale_divide", 0.0)
+ logit_softcapping = kwargs.get("logit_softcapping", 0.0)
+ importance_sampling_level = kwargs.get("importance_sampling_level", "token")
+ num_items_in_batch = kwargs.get("num_items_in_batch", None)
+ current_gradient_accumulation_steps = kwargs.get("current_gradient_accumulation_steps", 1)
+ num_processes = kwargs.get("num_processes",
1) + use_vllm = kwargs.get("use_vllm", False) + vllm_importance_sampling_cap = kwargs.get("vllm_importance_sampling_cap", 2.0) + input_ids = input_ids.unsqueeze(-1) + + # Optional logit softcapping and logit dividing + if logit_scale_multiply != 0: new_logits = new_logits * logit_scale_multiply + if logit_scale_divide != 0: new_logits = new_logits / logit_scale_divide + if logit_softcapping != 0: new_logits = new_logits * torch.tanh(new_logits / logit_softcapping) + + new_logits = new_logits.to(torch.float32) + # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + if temperature != 1.0: new_logits = new_logits / temperature + new_x = torch.gather(new_logits, dim = -1, index = input_ids).squeeze(-1) + new = new_x - torch.logsumexp(new_logits, dim = -1) + # x_i - logsumexp(x_i) + with torch.no_grad(): + if beta != 0.0: + assert ref_logits is not None, "ref_logits should not be None when beta != 0.0" + + # Optional logit softcapping and logit dividing + if logit_scale_multiply != 0: ref_logits = ref_logits * logit_scale_multiply + if logit_scale_divide != 0: ref_logits = ref_logits / logit_scale_divide + if logit_softcapping != 0: ref_logits = ref_logits * torch.tanh(ref_logits / logit_softcapping) + + ref_logits = ref_logits.to(torch.float32) + # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + if temperature != 1.0: ref_logits = ref_logits / temperature + ref_x = torch.gather(ref_logits, dim = -1, index = input_ids).squeeze(-1) + ref = ref_x - torch.logsumexp(ref_logits, dim = -1) + pass + + if old_logits is not None: + # Optional logit softcapping and logit dividing + if logit_scale_multiply != 0: old_logits = old_logits * logit_scale_multiply + if logit_scale_divide != 0: old_logits = old_logits / logit_scale_divide + if logit_softcapping != 0: old_logits = old_logits * torch.tanh(old_logits / logit_softcapping) + + old_logits = old_logits.to(torch.float32) + # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + if temperature != 1.0: old_logits = old_logits / temperature + old_x = torch.gather(old_logits, dim = -1, index = input_ids).squeeze(-1) + old = old_x - torch.logsumexp(old_logits, dim = -1) + pass + if use_vllm and sampling_per_token_logps is not None: + #must filter out extra prompt tokens in begining after making input_ids left padded + importance_sampling_ratio = torch.exp((old * mask) - sampling_per_token_logps) + importance_sampling_ratio = torch.clamp( + importance_sampling_ratio, max=vllm_importance_sampling_cap + ) + pass + pass + + # Reverse KL + # Note that this is a low variance low bias estimator for the KL divergence as used in GRPO paper + if beta != 0.0: + kl_i = torch.exp(ref - new) - (ref - new) - 1.0 + + else: + # set kl_i to a tensor of zeros with the correct shape + if importance_sampling_level == "sequence": + kl_i = new.new_zeros(new.size(0), 1) + else: + kl_i = torch.zeros_like(new) + # Full correct reverse KL divergence?? Missing term maybe? 
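+    # For reference, kl_i above is Schulman's k3 estimator
+    # exp(ref - new) - (ref - new) - 1, which is always non-negative and has
+    # low variance and low bias (see "Approximating KL Divergence",
+    # http://joschu.net/blog/kl-approx.html).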
+ # kl_i = torch.exp(new) * kl_i + + # Below is forward KL (normal KL) + # kl_i = torch.exp(old) * (old - new) + if old_logits is not None: + log_ratio = new - old + else: + log_ratio = new - new.detach() + + if importance_sampling_level == "token": + log_importance_weights = log_ratio + elif importance_sampling_level == "sequence": + log_importance_weights = (log_ratio * mask).sum(-1) / mask.sum(-1).clamp(min=1.0) + log_importance_weights = log_importance_weights.unsqueeze(-1) + else: + raise ValueError( + f"Unknown importance sampling level: {importance_sampling_level}. Possible values are 'token' " + "and 'sequence'." + ) + + coef_1 = torch.exp(log_importance_weights) + + coef_2 = torch.clamp(coef_1, 1 - epsilon_low, 1 + epsilon_high) + + if delta is not None: + loss_1 = torch.clamp(coef_1, max=delta) * advantages.unsqueeze(1) + else: + loss_1 = coef_1 * advantages.unsqueeze(1) + pass + + # Must detach - otherwise gradients are not propagated correctly! + # exp(x - x) == 1 + # loss_i = torch.exp(new - new.detach()) * advantages.unsqueeze(1) + + loss_2 = coef_2 * advantages.unsqueeze(1) + loss_i = -torch.min(loss_1, loss_2) + + if use_vllm and sampling_per_token_logps is not None: + loss_i = loss_i * importance_sampling_ratio + #delta for metric + with torch.no_grad(): + delta = torch.abs(old - sampling_per_token_logps) + delta = delta * mask + flat_is_ratio = importance_sampling_ratio * mask + else: + delta = torch.tensor([]).detach() + flat_is_ratio = torch.tensor([]).detach() + if beta != 0.0: + loss_i = loss_i + beta * kl_i + + mask = mask.to(torch.float32) + n_mask_per_reward = mask.sum(1) + + # https://github.com/huggingface/trl/blob/e8b8499f1f8d76838155b515e414ee98f757d6d5/trl/trainer/grpo_trainer.py#L1624 + if loss_type == "grpo": + loss = ((loss_i * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() + loss = loss / current_gradient_accumulation_steps + elif loss_type == "bnpo": + loss = (loss_i * mask).sum() / mask.sum().clamp(min=1.0) + loss = loss / current_gradient_accumulation_steps + elif loss_type == "dr_grpo": + loss = (loss_i * mask).sum() / (loss_i.size(0) * max_completion_length) + loss = loss / current_gradient_accumulation_steps + elif loss_type == "dapo": + normalizer = num_items_in_batch/ num_processes + loss = (loss_i * mask).sum() / normalizer + else: + raise ValueError(f"Unknown loss type: {loss_type}") + + # loss = (loss_i * mask).sum() / mask.sum() + + # Get metrics as well which are folded + def masked_batch_mean(x): + with torch.inference_mode(): + completion_length = n_mask_per_reward.mean() + if x.shape[1] == 1: # when importance_sampling_level == "sequence" + return completion_length, x.mean() + else: + mean_kl_per_reward = (x * mask).sum(1) / n_mask_per_reward + mean_kl = mean_kl_per_reward.mean() + return completion_length, mean_kl + completion_length, mean_kl = masked_batch_mean(kl_i) + return loss, completion_length, mean_kl, delta, flat_is_ratio + +def vLLMSamplingParams(**kwargs): + from vllm import SamplingParams + sampling_params = SamplingParams(**kwargs) + sampling_params._set_kwargs = kwargs + return sampling_params +@dataclass +class UnslothGRPOConfig(GRPOConfig): + """ + +Configuration class for the [`GRPOTrainer`]. + +This class includes only the parameters that are specific to GRPO training. For a full list of training arguments, +please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may +differ from those in [`~transformers.TrainingArguments`]. 
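+
+A minimal construction sketch (illustrative values; every field is documented
+below):
+
+```python
+config = UnslothGRPOConfig(
+    output_dir = "outputs",
+    per_device_train_batch_size = 8,
+    num_generations = 8,
+    max_prompt_length = 512,
+    max_completion_length = 256,
+)
+```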
+ +Using [`~transformers.HfArgumentParser`] we can turn this class into +[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the +command line. + +Parameters: + > Parameters that control the model and reference model + + model_init_kwargs (`str`, `dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model` + argument of the [`GRPOTrainer`] is provided as a string. + disable_dropout (`bool`, *optional*, defaults to `False`): + Whether to disable dropout in the model. This is useful for training with a reference model, as it prevents + the model from generating different logprobs for the same input. + + > Parameters that control the data preprocessing + + remove_unused_columns (`bool`, *optional*, defaults to `False`): + Whether to only keep the column `"prompt"` in the dataset. If you use a custom reward function that + requires any column other than `"prompts"` and `"completions"`, you should keep this to `False`. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left. + num_generations (`int` or `None`, *optional*, defaults to `8`): + Number of generations per prompt to sample. The effective batch size (num_processes * per_device_batch_size + * gradient_accumulation_steps) must be evenly divisible by this value. + max_completion_length (`int` or `None`, *optional*, defaults to `256`): + Maximum length of the generated completion. + ds3_gather_for_generation (`bool`, *optional*, defaults to `True`): + This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, + improving generation speed. However, disabling this option allows training models that exceed the VRAM + capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible + with vLLM generation. + shuffle_dataset (`bool`, *optional*, defaults to `True`): + Whether to shuffle the training dataset. + + > Parameters that control generation + + generation_batch_size: (`int` or `None`, *optional*, defaults to `None`): + Batch size to use for generation. If `None`, it defaults to the effective training batch size: + `per_device_train_batch_size * num_processes * steps_per_generation`. In other words, there is one + generation batch processed per optimization step. Mutually exclusive with `steps_per_generation`. + steps_per_generation: (`int` or `None`, *optional*, defaults to `None`): + Number of steps per generation. If `None`, it defaults to `gradient_accumulation_steps`. Mutually exclusive + with `generation_batch_size`. + temperature (`float`, defaults to `1.0`): + Temperature for sampling. The higher the temperature, the more random the completions. + top_p (`float`, *optional*, defaults to `1.0`): + Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to + `1.0` to consider all tokens. + top_k (`int` or `None`, *optional*, defaults to `None`): + Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is + disabled and all tokens are considered. + min_p (`float` or `None`, *optional*, defaults to `None`): + Minimum token probability, which will be scaled by the probability of the most likely token. It must be a + value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range. 
+ repetition_penalty (`float`, *optional*, defaults to `1.0`): + Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far. + Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat + tokens. + use_transformers_paged (`bool`, *optional*, defaults to `False`): + Whether to use the `transformers` paged implementation for generation. If set to `True`, the `transformers` + paged implementation will be used for generation instead of the default padded implementation. This + parameter is only effective when `use_vllm` is set to `False`. + cache_implementation (`str` or `None`, *optional*, defaults to `None`): + Implementation of the cache method for faster generation when `use_vllm` is set to `False`. + generation_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Additional keyword arguments to pass to `GenerationConfig` (if using transformers) or `SamplingParams` (if + using vLLM) when sampling completions. This can be used to further customize the generation behavior, such + as setting `suppress_tokens`, `num_beams`, etc. If it contains keys that conflict with the other generation + parameters (like `min_p`, `top_p`, etc.), they will override them. + + > Parameters that control generation acceleration powered by vLLM + + use_vllm (`bool`, *optional*, defaults to `False`): + Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for generation + instead of the default model.generate(). Requires `vllm` to be installed. + vllm_mode (`str`, *optional*, defaults to `"server"`): + Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or + `"colocate"`. + + - `"server"`: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM + server is running (start with `trl vllm-serve`). + - `"colocate"`: vLLM will run in the same process and share the training GPUs. This avoids the need for a + separate server but may cause resource contention with training. + vllm_model_impl (`str`, *optional*, defaults to `"vllm"`): + Model implementation to use for vLLM. Must be one of `"transformers"` or `"vllm"`. `"transformers"`: Use + the `transformers` backend for model implementation. `"vllm"`: Use the `vllm` library for model + implementation. + vllm_guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`): + Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled. + + > Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`) + + vllm_server_base_url (`str` or `None`, *optional*, defaults to `None`): + Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `vllm_server_host` and + `vllm_server_port` are ignored. + vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`): + Host of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided. + vllm_server_port (`int`, *optional*, defaults to `8000`): + Port of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided. + vllm_server_timeout (`float`, *optional*, defaults to `240.0`): + Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the + timeout, a `ConnectionError` is raised. 
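+
+    As an example, a server-mode setup (illustrative values) combines
+    `use_vllm=True`, `vllm_mode="server"` and
+    `vllm_server_base_url="http://localhost:8000"`, with the TRL vLLM server
+    started separately via `trl vllm-serve`.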
+ + > Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`) + + vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.3`): + Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set to + `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when + launching the vLLM server via the `--vllm_gpu_memory_utilization` flag. + vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`): + Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set to + `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when + launching the vLLM server via the `--vllm_tensor_parallel_size` flag. + vllm_enable_sleep_mode (`bool`, *optional*, defaults to `False`): + Whether to enable sleep mode for vLLM. If `True`, vLLM will sleep during the optimization step and woken + for weight sync and generation. + + > Parameters that control the training + + beta (`float`, *optional*, defaults to `0.0`): + KL coefficient. If `0.0` (default), the reference model is not loaded, reducing memory usage and improving + training speed. + num_iterations (`int`, *optional*, defaults to `1`): + Number of iterations per batch (denoted as μ in the algorithm). + epsilon (`float`, *optional*, defaults to `0.2`): + Epsilon value for clipping. + delta (`float` or `None`, *optional*, defaults to `None`): + Enables the upper clipping bound in two-sided GRPO loss when set to a float. If `None` (default), standard + GRPO clipping is used. Recommended to be greater than `1 + ε` when enabled. This method is introduced in + the [INTELLECT-2 tech report](https://huggingface.co/papers/2505.07291). + epsilon_high (`float` or `None`, *optional*, defaults to `None`): + Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound + specified in argument `epsilon`. Paper [DAPO](https://huggingface.co/papers/2503.14476) recommends `0.28`. + importance_sampling_level (`str`, *optional*, defaults to `"token"`): + Controls whether importance sampling ratios are computed at the `"token"` or `"sequence"` level. `"token"` + keeps the raw per-token log-probability ratios (one weight per token). `"sequence"` averages the + log-probability ratios across valid tokens to produce a single ratio per sequence. The [GSPO + paper](https://huggingface.co/papers/2507.18071) shows that sequence-level sampling often yields more + stable training and better alignment with sequence-level rewards. + reward_weights (`list[float]` or `None`, *optional*, defaults to `None`): + Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are + weighted equally with weight `1.0`. + scale_rewards (`str` or `bool`, *optional*, defaults to `"group"`): + Specifies the scaling strategy for rewards. Supported values are: + + - `True` or `"group"` (default): rewards are scaled by the standard deviation within each group, ensuring + unit variance within a group. + - `"batch"`: rewards are scaled by the standard deviation across the entire batch, as recommended in the + [PPO Lite paper](https://huggingface.co/papers/2508.08221). + - `False` or `"none"`: no scaling is applied. The [Dr. GRPO + paper](https://huggingface.co/papers/2503.20783) recommends not scaling rewards, as scaling by the + standard deviation introduces a question-level difficulty bias. 
+ loss_type (`str`, *optional*, defaults to `"dapo"`): + Specifies the loss formulation to use. Supported values are: + + - `"grpo"`: Aggregates token-level losses by normalizing over sequence length. Not recommended due to + length bias—this approach tends to prefer shorter completions with positive advantages and longer ones + with negative advantages. + - `"dr_grpo"`: Aggregates token-level losses by normalizing with a global constant. This method was + introduced in the [Dr. GRPO paper](https://huggingface.co/papers/2503.20783) to eliminate length bias. + The value of the constant corresponds to `max_completion_length`. + - `"dapo"` (default): Aggregates token-level losses by normalizing with the number of active token in the + global accumulated batch. This method was introduced in the [DAPO + paper](https://huggingface.co/papers/2503.14476) to eliminate length bias. + - `"bnpo"`: Aggregates token-level losses by normalizing with the number of active token in the local + batch. Note that normalization is performed over the local batch only, so results may slightly vary + depending on the local batch size, despite a constant effective batch size. When using + `per_device_train_batch_size==1`, the loss is equivalent to the GRPO loss. + mask_truncated_completions (`bool`, *optional*, defaults to `False`): + When enabled, truncated completions are excluded from the loss calculation, preventing them from being + incorrectly penalized and introducing noise during training. According to the + [DAPO](https://huggingface.co/papers/2503.14476) paper, this is a good practice for training stability. + sync_ref_model (`bool`, *optional*, defaults to `False`): + Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using + the `ref_model_mixup_alpha` parameter. This synchronization originates from the + [TR-DPO](https://huggingface.co/papers/2404.09656) paper. + ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`): + α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix + between the current policy and the previous reference policy during updates. The reference policy is + updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you + must set `sync_ref_model=True`. + ref_model_sync_steps (`int`, *optional*, defaults to `512`): + τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how + frequently the current policy is synchronized with the reference policy. To use this parameter, you must + set `sync_ref_model=True`. + top_entropy_quantile (`float`, *optional*, defaults to `1.0`): + ρ parameter from [Beyond the 80/20 Rule](https://huggingface.co/papers/2506.01939). Keeps in the policy + loss term only the top-ρ quantile of tokens by entropy of the probability distribution at each sequence + position, improving results. Range: `[0.0-1.0]`. A value of `0.0` masks all but the highest entropy token; + `1.0` keeps all tokens. The paper recommends a value of `0.2`. If used with + `mask_truncated_completions=True`, only tokens from non-truncated completions are considered. + use_liger_loss (`bool`, *optional*, defaults to `False`): + Whether to use the Liger GRPO loss. + vllm_importance_sampling_correction (`bool`, *optional*, defaults to `True`): + Whether to apply Truncated Importance Sampling (TIS) between vLLM completion logprobs and recomputed + logprobs. 
[Your Efficient RL Framework Secretly Brings You Off-Policy RL + Training](https://fengyao.notion.site/off-policy-rl) highlights that using a separate generation framework + (such as vLLM) can introduce off-policy effects due to subtle implementation differences between generation + and training backends. TIS is proposed as a remedy for this issue. + vllm_importance_sampling_cap (`float`, *optional*, defaults to `2.0`): + Truncation parameter C for Truncated Importance Sampling (TIS). This sets an upper bound on the importance + sampling ratio, improving training stability. + + > Parameters that control the logging + + log_completions (`bool`, *optional*, defaults to `False`): + Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is installed, + it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`. + num_completions_to_print (`int` or `None`, *optional*, defaults to `None`): + Number of completions to print with `rich`. If `None`, all completions are logged. + wandb_log_unique_prompts (`bool`, *optional*, defaults to `False`): + Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, all prompts + are logged. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = False, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = None, + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + parallelism_config = None, + 
deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + project = 'huggingface', + trackio_space_id = 'trackio', + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = True, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = True, + model_init_kwargs = None, + disable_dropout = False, + max_prompt_length = 512, + num_generations = 8, + max_completion_length = 256, + ds3_gather_for_generation = True, + shuffle_dataset = True, + generation_batch_size = None, + steps_per_generation = None, + temperature = 1.0, + top_p = 1.0, + top_k = None, + min_p = None, + generation_kwargs = {}, + repetition_penalty = 1.0, + use_transformers_paged = False, + cache_implementation = None, + use_vllm = False, + vllm_mode = 'colocate', + vllm_model_impl = 'vllm', + vllm_enable_sleep_mode = False, + vllm_guided_decoding_regex = None, + vllm_server_base_url = None, + vllm_server_host = '0.0.0.0', + vllm_server_port = 8000, + vllm_server_timeout = 240.0, + vllm_gpu_memory_utilization = 0.3, + vllm_tensor_parallel_size = 1, + beta = 0.001, + num_iterations = 1, + epsilon = 0.2, + delta = None, + epsilon_high = None, + importance_sampling_level = 'token', + reward_weights = None, + scale_rewards = 'group', + loss_type = 'bnpo', + mask_truncated_completions = False, + sync_ref_model = False, + ref_model_mixup_alpha = 0.6, + ref_model_sync_steps = 512, + top_entropy_quantile = 1.0, + use_liger_loss = False, + vllm_importance_sampling_correction = False, + vllm_importance_sampling_cap = 2.0, + log_completions = False, + num_completions_to_print = None, + wandb_log_unique_prompts = False, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + + **kwargs, + ): + if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! 
Consider decreasing it to 1e-1, otherwise gradient updates will explode!')
+ if output_dir is None and save_strategy == 'steps' and save_steps == 500:
+ output_dir = 'unsloth_training_checkpoints'
+ save_strategy = 'no'
+ if loss_type.lower() == 'dr_grpo':
+ loss_type = 'dr_grpo'
+ elif loss_type.lower() == 'dapo':
+ loss_type = 'dapo'
+ if loss_type.lower() == 'dr_grpo':
+ if scale_rewards == None:
+ scale_rewards = True
+ elif scale_rewards == True:
+ print('Unsloth: The Dr GRPO paper recommends setting `scale_rewards` to False! Will override. Set it to `None` to force False.')
+ scale_rewards = False
+ elif loss_type.lower() == 'dapo':
+ if mask_truncated_completions != True:
+ print('Unsloth: The DAPO paper recommends `mask_truncated_completions = True` - we will set it.')
+ if epsilon_high != 0.28:
+ print('Unsloth: The DAPO paper recommends `epsilon_high = 0.28` - we will set it.')
+ if beta != 0.0:
+ print('Unsloth: The DAPO paper recommends setting `beta = 0.0` to remove the KL term - we will set it.')
+ mask_truncated_completions = True
+ epsilon_high = 0.28
+ beta = 0.0
+
+ if (per_device_train_batch_size // num_generations) * num_generations != per_device_train_batch_size:
+ print('Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.\nWe will change the batch size of ' + str(per_device_train_batch_size) + ' to the `num_generations` of ' + str(num_generations))
+ per_device_train_batch_size = num_generations
+
+ if temperature <= 0:
+ raise ValueError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.')
+ elif temperature >= 10:
+ raise ValueError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.')
+
+
+ super().__init__(
+ output_dir = output_dir,
+ overwrite_output_dir = overwrite_output_dir,
+ do_train = do_train,
+ do_eval = do_eval,
+ do_predict = do_predict,
+ eval_strategy = eval_strategy,
+ prediction_loss_only = prediction_loss_only,
+ per_device_train_batch_size = per_device_train_batch_size,
+ per_device_eval_batch_size = per_device_eval_batch_size,
+ per_gpu_train_batch_size = per_gpu_train_batch_size,
+ per_gpu_eval_batch_size = per_gpu_eval_batch_size,
+ gradient_accumulation_steps = gradient_accumulation_steps,
+ eval_accumulation_steps = eval_accumulation_steps,
+ eval_delay = eval_delay,
+ torch_empty_cache_steps = torch_empty_cache_steps,
+ learning_rate = learning_rate,
+ weight_decay = weight_decay,
+ adam_beta1 = adam_beta1,
+ adam_beta2 = adam_beta2,
+ adam_epsilon = adam_epsilon,
+ max_grad_norm = max_grad_norm,
+ num_train_epochs = num_train_epochs,
+ max_steps = max_steps,
+ lr_scheduler_type = lr_scheduler_type,
+ warmup_ratio = warmup_ratio,
+ warmup_steps = warmup_steps,
+ log_level = log_level,
+ log_level_replica = log_level_replica,
+ log_on_each_node = log_on_each_node,
+ logging_dir = logging_dir,
+ logging_strategy = logging_strategy,
+ logging_first_step = logging_first_step,
+ logging_steps = logging_steps,
+ logging_nan_inf_filter = logging_nan_inf_filter,
+ save_strategy = save_strategy,
+ save_steps = save_steps,
+ save_total_limit = save_total_limit,
+ save_safetensors = save_safetensors,
+ save_on_each_node = save_on_each_node,
+ save_only_model = save_only_model,
+ restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint,
+ no_cuda = no_cuda,
+ use_cpu = use_cpu,
+ use_mps_device = use_mps_device,
+ seed = seed,
+ data_seed = data_seed,
+ jit_mode_eval = jit_mode_eval,
+ bf16 = bf16,
+
fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + model_init_kwargs = model_init_kwargs, + disable_dropout = disable_dropout, + max_prompt_length = max_prompt_length, + num_generations = num_generations, + max_completion_length = max_completion_length, + ds3_gather_for_generation = ds3_gather_for_generation, + shuffle_dataset = 
shuffle_dataset, + generation_batch_size = generation_batch_size, + steps_per_generation = steps_per_generation, + temperature = temperature, + top_p = top_p, + top_k = top_k, + min_p = min_p, + generation_kwargs = generation_kwargs, + repetition_penalty = repetition_penalty, + use_transformers_paged = use_transformers_paged, + cache_implementation = cache_implementation, + use_vllm = use_vllm, + vllm_mode = vllm_mode, + vllm_model_impl = vllm_model_impl, + vllm_enable_sleep_mode = vllm_enable_sleep_mode, + vllm_guided_decoding_regex = vllm_guided_decoding_regex, + vllm_server_base_url = vllm_server_base_url, + vllm_server_host = vllm_server_host, + vllm_server_port = vllm_server_port, + vllm_server_timeout = vllm_server_timeout, + vllm_gpu_memory_utilization = vllm_gpu_memory_utilization, + vllm_tensor_parallel_size = vllm_tensor_parallel_size, + beta = beta, + num_iterations = num_iterations, + epsilon = epsilon, + delta = delta, + epsilon_high = epsilon_high, + importance_sampling_level = importance_sampling_level, + reward_weights = reward_weights, + scale_rewards = scale_rewards, + loss_type = loss_type, + mask_truncated_completions = mask_truncated_completions, + sync_ref_model = sync_ref_model, + ref_model_mixup_alpha = ref_model_mixup_alpha, + ref_model_sync_steps = ref_model_sync_steps, + top_entropy_quantile = top_entropy_quantile, + use_liger_loss = use_liger_loss, + vllm_importance_sampling_correction = vllm_importance_sampling_correction, + vllm_importance_sampling_cap = vllm_importance_sampling_cap, + log_completions = log_completions, + num_completions_to_print = num_completions_to_print, + wandb_log_unique_prompts = wandb_log_unique_prompts,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks + +pass + +class _UnslothGRPOTrainer(Trainer): + """ + Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the + paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language + Models](https://huggingface.co/papers/2402.03300). + + Example: + + ```python + from datasets import load_dataset + from trl import GRPOTrainer + + dataset = load_dataset("trl-lib/tldr", split="train") + def reward_func(completions, **kwargs): + # Dummy reward function that rewards completions with more unique letters. + return [float(len(set(completion))) for completion in completions] + trainer = GRPOTrainer( + model="Qwen/Qwen2-0.5B-Instruct", + reward_funcs=reward_func, + train_dataset=dataset, + ) + + trainer.train() + ``` + + Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in + `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. + reward_funcs (`Union[RewardFunc, list[RewardFunc]]`): + Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward + functions with the prompts and completions and sum the rewards. 
Can be either:
+
+            - A single reward function, such as:
+                - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
+                  path to a *directory* containing model weights saved using
+                  [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+                  using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
+                  keyword arguments in `args.model_init_kwargs`.
+                - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
+                - A custom reward function: The function is provided with the prompts and the generated completions,
+                  plus any additional columns in the dataset. It should return a list of rewards. Custom reward
+                  functions can also return `None` when the reward is not applicable to those samples. This is useful
+                  for multi-task training where different reward functions apply to different types of samples. When a
+                  reward function returns `None` for a sample, that reward function is excluded from the reward
+                  calculation for that sample. For more details, see [Using a custom reward
+                  function](#using-a-custom-reward-function).
+
+                  The trainer's state is also passed to the reward function; it is an instance of
+                  [`~transformers.TrainerState`] and can be accessed via the `trainer_state` argument in the reward
+                  function's signature.
+            - A list of reward functions, where each item can independently be any of the above types. Mixing different
+              types within the list (e.g., a string model ID and a custom reward function) is allowed.
+        args ([`GRPOConfig`], *optional*, defaults to `None`):
+            Configuration for this trainer. If `None`, a default configuration is used.
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset are
+            ignored. The format of the samples can be either:
+
+            - [Standard](dataset_formats#standard): Each sample contains plain text.
+            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+              and content).
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
+            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`] or `None`, *optional*, defaults to `None`):
+            Processing class used to process the data. The padding side must be set to "left". If `None`, the
+            processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A
+            padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token,
+            `tokenizer.eos_token` will be used as the default.
+        reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
+            Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
+
+            - A single processing class: Used when `reward_funcs` contains only one reward function.
+            - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
+ If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is + `None`, the tokenizer for the model is automatically loaded using + [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward + functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes` + are ignored. + callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`): + List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed + in [here](https://huggingface.co/docs/transformers/main_classes/callback). + + If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`] + method. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): + A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your + model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`): + PEFT configuration used to wrap the model. If `None`, the model is not wrapped. + """ + + _tag_names = ["trl", "grpo"] + + def __init__( + self, + model: Union[str, PreTrainedModel], + reward_funcs: Union[RewardFunc, list[RewardFunc]], + args: Optional[GRPOConfig] = None, + train_dataset: Optional[Union[Dataset, IterableDataset]] = None, + eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None, + processing_class: Optional[Union[PreTrainedTokenizerBase, ProcessorMixin]] = None, + reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), + peft_config: Optional["PeftConfig"] = None, + ): + + if hasattr(model, 'vllm_engine') and hasattr(args, 'use_vllm'): + if (getattr(args, 'use_vllm', False) == False): + args.use_vllm = True + args.vllm_mode='colocate' + # Args + if args is None: + model_name = model if isinstance(model, str) else model.config._name_or_path + model_name = model_name.split("/")[-1] + args = GRPOConfig(f"{model_name}-GRPO") + + # Models + # Trained model + model_init_kwargs = args.model_init_kwargs or {} + if isinstance(model, str): + model_id = model + dtype = model_init_kwargs.get("dtype") + if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None: + pass # dtype is already a torch.dtype or "auto" or None + elif isinstance(dtype, str): # it's a str, but not "auto" + dtype = getattr(torch, dtype) + model_init_kwargs["dtype"] = dtype + else: + raise ValueError( + "Invalid `dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing " + f"a `torch.dtype` (e.g., 'float32'), but got {dtype}." + ) + # Disable caching if gradient checkpointing is enabled [not supported] + config = AutoConfig.from_pretrained(model_id) + architecture = getattr(transformers, config.architectures[0]) + model = architecture.from_pretrained(model_id, **model_init_kwargs) + else: + model_id = model.config._name_or_path + if args.model_init_kwargs is not None: + logger.warning( + "You passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. " + "The `model_init_kwargs` will be ignored." 
+ ) + + # Some models [SmolVLM/Idefics3] don't support `logits_to_keep` argument and error out if we pass it + # Inspect the forward method before we wrap the model with PEFT + self.model_kwarg_keys = ( + inspect.signature(model.forward).parameters.keys() + if not hasattr(model, "get_base_model") + else inspect.signature(model.get_base_model().forward).parameters.keys() + ) + + if False: + model = prepare_peft_model(model, peft_config, args) + + # Processing class + if processing_class is None: + processing_class = AutoProcessor.from_pretrained(model.config._name_or_path) + + # Handle pad token for processors or tokenizers + if isinstance(processing_class, ProcessorMixin): + tokenizer = processing_class.tokenizer + elif isinstance(processing_class, PreTrainedTokenizerBase): + tokenizer = processing_class + else: + raise TypeError("The `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`") + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + self.pad_token = tokenizer.pad_token + self.pad_token_id = tokenizer.pad_token_id + self.eos_token_id = tokenizer.eos_token_id + self.image_token = getattr(processing_class, "image_token", None) + self.image_token_id = getattr(processing_class, "image_token_id", None) + self.vision_start_token_id = getattr(model.config, "vision_start_token_id", None) + self.vision_end_token_id = getattr(model.config, "vision_end_token_id", None) + + # Reward functions + if not isinstance(reward_funcs, list): + reward_funcs = [reward_funcs] + self.reward_func_names = [] + for i, reward_func in enumerate(reward_funcs): + if isinstance(reward_func, str): + reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained( + reward_func, num_labels=1, **model_init_kwargs + ) + if isinstance(reward_funcs[i], nn.Module): # Use Module over PretrainedModel for compat w/ compiled models + self.reward_func_names.append(reward_funcs[i].config._name_or_path.split("/")[-1]) + else: + self.reward_func_names.append(reward_funcs[i].__name__) + self.reward_funcs = reward_funcs + + # Reward weights + if args.reward_weights is not None: + if len(args.reward_weights) != len(reward_funcs): + raise ValueError( + f"Number of reward weights ({len(args.reward_weights)}) must match number of reward " + f"functions ({len(reward_funcs)})" + ) + self.reward_weights = torch.tensor(args.reward_weights, dtype=torch.float32) + else: + self.reward_weights = torch.ones(len(reward_funcs), dtype=torch.float32) + + # Reward processing class + if reward_processing_classes is None: + reward_processing_classes = [None] * len(reward_funcs) + elif not isinstance(reward_processing_classes, list): + reward_processing_classes = [reward_processing_classes] + if len(reward_processing_classes) != len(reward_funcs): + raise ValueError( + f"The number of reward processing classes ({len(reward_processing_classes)}) must match the number of " + f"reward functions ({len(reward_funcs)})." + ) + + for i, (reward_processing_class, reward_func) in enumerate(zip(reward_processing_classes, reward_funcs)): + if isinstance(reward_func, PreTrainedModel): + if reward_processing_class is None: + reward_processing_class = AutoTokenizer.from_pretrained(reward_func.config._name_or_path) + if reward_processing_class.pad_token_id is None: + reward_processing_class.pad_token = reward_processing_class.eos_token + # The reward model computes the reward for the latest non-padded token in the input sequence. 
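+                # (Reward inputs are tokenized with `padding_side="right"` in `_calculate_rewards` below, so the
+                # "latest non-padded token" is the completion's final token.)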
+                # So it's important to set the pad token ID to the padding token ID of the processing class.
+                reward_func.config.pad_token_id = reward_processing_class.pad_token_id
+                reward_processing_classes[i] = reward_processing_class
+
+        self.reward_processing_classes = reward_processing_classes
+
+        # Training arguments
+        self.max_prompt_length = args.max_prompt_length
+        self.max_completion_length = args.max_completion_length  # = |o_i| in the GRPO paper
+        self.num_generations = args.num_generations  # = G in the GRPO paper
+        self.temperature = args.temperature
+        self.top_p = args.top_p
+        self.top_k = args.top_k
+        self.min_p = args.min_p
+        self.repetition_penalty = args.repetition_penalty
+        self.use_transformers_paged = args.use_transformers_paged
+        self.use_vllm = args.use_vllm
+        self.vllm_mode = args.vllm_mode
+        self.vllm_gpu_memory_utilization = args.vllm_gpu_memory_utilization  # only applies to colocation mode
+        self.vllm_tensor_parallel_size = args.vllm_tensor_parallel_size  # only applies to colocation mode
+        self.vllm_importance_sampling_correction = args.vllm_importance_sampling_correction
+        self.vllm_importance_sampling_cap = args.vllm_importance_sampling_cap
+        self.use_liger_loss = args.use_liger_loss
+        self.loss_type = args.loss_type
+        self.scale_rewards = args.scale_rewards
+        self.importance_sampling_level = args.importance_sampling_level
+        self.mask_truncated_completions = args.mask_truncated_completions
+        self.top_entropy_quantile = args.top_entropy_quantile
+        if self.use_liger_loss and self.top_entropy_quantile < 1.0:
+            raise NotImplementedError(
+                "Liger Kernels don't currently support masking token positions based on entropy."
+            )
+        if self.use_liger_loss and self.importance_sampling_level != "token":
+            raise NotImplementedError(
+                "Liger Kernels currently only support token-level importance sampling. Please set "
+                "`importance_sampling_level` to 'token'."
+            )
+
+        # Datasets
+        self.shuffle_dataset = args.shuffle_dataset
+
+        if (
+            isinstance(train_dataset, IterableDataset)
+            or isinstance(eval_dataset, IterableDataset)
+            or (
+                isinstance(eval_dataset, dict) and any(isinstance(ds, IterableDataset) for ds in eval_dataset.values())
+            )
+        ):
+            # See https://github.com/huggingface/trl/issues/3213
+            raise NotImplementedError(
+                "Iterable datasets are not yet supported in GRPOTrainer. Please use a standard dataset instead."
+            )
+
+        # Multi-step
+        self.num_iterations = args.num_iterations  # = 𝜇 in the GRPO paper
+        self.epsilon_low = args.epsilon
+        self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon
+        # Tracks the number of iterations [forward + backward passes], including those within a grad accum cycle
+        self._step = 0
+        # Buffer the batch to reuse generated outputs across multiple updates. For more details, see
+        # `_get_train_sampler` and `_prepare_inputs`.
+        self._buffered_inputs = None
+
+        # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in GRPO, the sampled data does not include the
+        # "input_ids" key. Instead, the only available key is "prompt". As a result, the trainer issues the warning:
+        # "Could not estimate the number of tokens of the input, floating-point operations will not be computed." To
+        # suppress this warning, we set the "estimate_tokens" key in the model's "warnings_issued" dictionary to True.
+        # This acts as a flag to indicate that the warning has already been issued.
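+        # (`warnings_issued` is a plain dict kept on transformers model instances, keyed by warning name;
+        # flipping the flag here is enough for the Trainer to skip the FLOPs warning.)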
+ model.warnings_issued["estimate_tokens"] = True + + super().__init__( + model=model, + args=args, + data_collator=identity, # No data collation is needed in GRPO + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + callbacks=callbacks, + optimizers=optimizers, + # In Trainer, `training_step` scales the loss by `gradient_accumulation_steps` only if `compute_loss_func` + # is None. For DAPO, loss scaling instead depends on the total number of completions tokens across the + # global accumulated batch. To control scaling ourselves, we must disable Trainer’s built-in scaling. The + # simplest [though a bit hacky] way is to set `compute_loss_func` to any non-None value, which bypasses + # that behavior without rewriting `training_step`. + compute_loss_func="non-None value to disable scaling", + ) + + # Reference model + self.beta = args.beta + if self.beta == 0.0: + # If beta is 0.0, the reference model is not needed + self.ref_model = None + elif is_peft_model(model): + # If PEFT is used, the reference model is not needed since the adapter can be disabled + # to revert to the initial model. + self.ref_model = None + else: + # For deepspeed, fsdp or non-distributed models, create a reference model from scratch + config = AutoConfig.from_pretrained(model_id) + architecture = getattr(transformers, config.architectures[0]) + self.ref_model = architecture.from_pretrained(model_id, **model_init_kwargs) + + # Disable dropout in the models + if args.disable_dropout: + disable_dropout_in_model(model) + if self.ref_model is not None: + disable_dropout_in_model(self.ref_model) + + # Liger loss + if self.use_liger_loss: + if not is_liger_kernel_available(): + raise ImportError( + "Liger is required to use `liger_loss` as the GRPO loss. Run `pip install liger-kernel`." + ) + # redirect the model.module forward to the model forward to ensure pre-forward hooks are called + self._forward_redirection = _ForwardRedirection() + + self.liger_grpo_loss = LigerFusedLinearGRPOLoss( + beta=self.beta, + epsilon_low=self.epsilon_low, + epsilon_high=self.epsilon_high, + temperature=self.temperature, + use_ref_model=self.beta != 0.0, + loss_type=self.loss_type, + max_completion_length=self.max_completion_length, + ) + + # Initialize the metrics + self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)} + self._total_train_tokens = 0 + self.log_completions = args.log_completions + self.wandb_log_unique_prompts = args.wandb_log_unique_prompts + self.num_completions_to_print = args.num_completions_to_print + # Keep logs sized to the generation batch to record only outputs from the latest model update. + self._logs = { + "image": deque(maxlen=args.generation_batch_size), + "prompt": deque(maxlen=args.generation_batch_size), + "completion": deque(maxlen=args.generation_batch_size), + "rewards": defaultdict(lambda: deque(maxlen=args.generation_batch_size)), + "advantages": deque(maxlen=args.generation_batch_size), + } + + # Ensure each process receives a unique seed to prevent duplicate completions when generating with + # transformers if num_generations exceeds per_device_train_batch_size. We could skip it if we use vLLM, but + # it's safer to set it in all cases. + set_seed(args.seed, device_specific=True) + + if self.use_vllm: + if not is_vllm_available(): + raise ImportError( + "vLLM is not available and `use_vllm` is set to True. Please install vLLM with " + "`pip install vllm` to use it." 
+ ) + + if self.vllm_mode == "server": + if self.accelerator.is_main_process: + if args.vllm_server_base_url is not None: + base_url = args.vllm_server_base_url + else: + base_url = f"http://{args.vllm_server_host}:{args.vllm_server_port}" + self.vllm_client = VLLMClient(base_url=base_url, connection_timeout=args.vllm_server_timeout) + self.vllm_client.init_communicator(device=torch.cuda.current_device()) + + elif self.vllm_mode == "colocate": + if not self.accelerator.num_processes % self.vllm_tensor_parallel_size == 0: + raise ValueError( + f"vllm_tensor_parallel_size ({self.vllm_tensor_parallel_size}) must divide world size " + f"({self.accelerator.num_processes}) evenly." + ) + + if self.vllm_tensor_parallel_size > 1: + self.tp_group, _ = torch.distributed.new_subgroups_by_enumeration( + [ + list(range(i * self.vllm_tensor_parallel_size, (i + 1) * self.vllm_tensor_parallel_size)) + for i in range(self.accelerator.num_processes // self.vllm_tensor_parallel_size) + ] + ) + os.environ["RANK"] = str(self.accelerator.process_index) + os.environ["LOCAL_RANK"] = str(self.accelerator.local_process_index) + os.environ["WORLD_SIZE"] = str(self.accelerator.num_processes) + os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "localhost") + os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "12345") + + if self.max_prompt_length is not None and self.max_completion_length is not None: + max_model_len = self.max_prompt_length + self.max_completion_length + else: + max_model_len = None + self.llm = model.vllm_engine + if self.args.vllm_enable_sleep_mode: + self.llm.sleep(level=1) + else: + raise ValueError(f"vllm_mode must be either 'server' or 'colocate', got '{self.vllm_mode}'.") + self.guided_decoding_regex = args.vllm_guided_decoding_regex + + self._last_loaded_step = -1 + self.accelerator.wait_for_everyone() + else: + generation_kwargs = { + "max_new_tokens": self.max_completion_length, + "do_sample": True, + "pad_token_id": tokenizer.pad_token_id, + "bos_token_id": tokenizer.bos_token_id, + "eos_token_id": tokenizer.eos_token_id, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + "min_p": self.min_p, + "repetition_penalty": self.repetition_penalty, + "cache_implementation": args.cache_implementation, + } + if args.use_transformers_paged: + generation_kwargs["max_batch_tokens"] = 512 + generation_kwargs["num_blocks"] = 1024 + generation_kwargs["block_size"] = 128 + if args.generation_kwargs is not None: + generation_kwargs.update(args.generation_kwargs) + self.generation_config = GenerationConfig(**generation_kwargs) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. 
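+        # (In recent transformers versions, Trainer skips dividing the loss by `gradient_accumulation_steps`
+        # when the model accepts loss kwargs; forcing this flag to False restores that scaling.)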
+        self.model_accepts_loss_kwargs = False
+
+        # Add tags to the model
+        self.model.add_model_tags(self._tag_names)
+
+        if self.ref_model is not None:
+            if self.is_deepspeed_enabled:
+                self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
+            elif self.is_fsdp_enabled:
+                self.ref_model = prepare_fsdp(self.ref_model, self.accelerator)
+            else:
+                self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
+
+        if args.sync_ref_model:
+            self.add_callback(SyncRefModelCallback(ref_model=self.ref_model, accelerator=self.accelerator))
+
+        for i, reward_func in enumerate(self.reward_funcs):
+            if isinstance(reward_func, PreTrainedModel):
+                if self.is_deepspeed_enabled:
+                    self.reward_funcs[i] = prepare_deepspeed(reward_func, self.accelerator)
+                else:
+                    # set device placement to True to make `prepare_model` move `reward_func` to device when using fsdp
+                    self.reward_funcs[i] = self.accelerator.prepare_model(
+                        reward_func, evaluation_mode=True, device_placement=True
+                    )
+
+    def _set_signature_columns_if_needed(self):
+        # If `self.args.remove_unused_columns` is True, non-signature columns are removed.
+        # By default, this method sets `self._signature_columns` to the model's expected inputs.
+        # In GRPOTrainer, we preprocess data, so using the model's signature columns doesn't work.
+        # Instead, we set them to the columns expected by the `training_step` method, hence the override.
+        if self._signature_columns is None:
+            self._signature_columns = ["prompt", "image"]
+
+    # This method overrides `Trainer.get_train_dataloader` to support our custom batching strategy.
+    # Instead of returning a standard per-step batch (i.e., `per_device_batch_size`), our dataloader loads a
+    # *generation* batch (i.e., `per_device_batch_size × steps_per_generation`). This allows us to generate completions
+    # once every `steps_per_generation` steps, rather than once per accumulation step, which is significantly more
+    # efficient. The only change from the original implementation is multiplying the batch size by
+    # `steps_per_generation`. Thus, `_prepare_inputs` is called with this *generation* batch, and it handles the
+    # splitting internally.
+    # Maintenance note: This method is a copy-paste of the original `Trainer.get_train_dataloader` with only one line
+    # modified. As a result, some parts of the method aren't relevant to GRPO, but we keep them to stay one line
+    # apart from the super method, ensuring easier maintenance in the future.
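+    # Example with hypothetical numbers: `per_device_train_batch_size=4` and `steps_per_generation=4`
+    # give a dataloader batch of 16 prompts; completions are generated once for all 16 and then
+    # consumed over the next 4 training steps.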
+ def get_train_dataloader(self): + if self.train_dataset is None: + raise ValueError("Trainer: training requires a train_dataset.") + + train_dataset = self.train_dataset + data_collator = self.data_collator + if is_datasets_available() and isinstance(train_dataset, datasets.Dataset): + train_dataset = self._remove_unused_columns(train_dataset, description="training") + else: + data_collator = self._get_collator_with_removed_columns(data_collator, description="training") + + dataloader_params = { + "batch_size": self._train_batch_size * self.args.steps_per_generation, # < this is the change + "collate_fn": data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "persistent_workers": self.args.dataloader_persistent_workers, + } + + if not isinstance(train_dataset, torch.utils.data.IterableDataset): + dataloader_params["sampler"] = self._get_train_sampler() + dataloader_params["drop_last"] = self.args.dataloader_drop_last + dataloader_params["worker_init_fn"] = partial( + seed_worker, num_workers=self.args.dataloader_num_workers, rank=self.args.process_index + ) + + dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor + + return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params)) + + def _get_train_sampler(self, dataset: Optional[Dataset] = None) -> Sampler: + # Returns a sampler that + # 1. ensures each prompt is repeated across multiple processes. This guarantees that identical prompts are + # distributed to different GPUs, allowing rewards to be computed and normalized correctly within each prompt + # group. Using the same seed across processes ensures consistent prompt assignment, preventing discrepancies + # in group formation. + # 2. repeats the batch multiple times to allow reusing generations across multiple updates. Refer to + # _prepare_inputs to see how the generations are stored and reused. + + # In the following figure, the values are the prompt indices. The first row shows the first sampled batch, the + # second row shows the second sampled batch, and so on. + # + # | GPU 0 | GPU 1 | + # + # global_step step <-───> num_generations=2 + # <-───────> per_device_train_batch_size=3 + # grad_accum ▲ ▲ 0 0 0 0 1 1 2 2 <- Generate for the first `steps_per_generation` (prompts 0 to 11); store the completions; use the first slice to compute the loss + # =2 ▼ | 0 1 3 3 4 4 5 5 <- Take the stored generations and use the second slice to compute the loss + # | + # | 1 2 6 6 7 7 8 8 <- Take the stored generations and use the third slice to compute the loss + # steps_per_gen=4 ▼ 1 3 9 9 10 10 11 11 <- Take the stored generations and use the fourth slice to compute the loss + # + # 2 4 12 12 13 13 14 14 <- Generate for the second `steps_per_generation` (prompts 12 to 23); store the completions; use the first slice to compute the loss + # 2 5 15 15 16 16 17 17 <- Take the stored generations and use the second slice to compute the loss + # ... + if dataset is None: + dataset = self.train_dataset + return RepeatSampler( + data_source=dataset, + mini_repeat_count=self.num_generations, + batch_size=self.args.generation_batch_size // self.num_generations, + repeat_count=self.num_iterations * self.args.steps_per_generation, + shuffle=self.shuffle_dataset, + seed=self.args.seed, + ) + + def _get_eval_sampler(self, eval_dataset) -> Sampler: + # See _get_train_sampler for an explanation of the sampler. 
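+        # Evaluation only needs each prompt repeated `num_generations` times; completions are not
+        # reused across steps, so no `repeat_count` is passed here.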
+ return RepeatSampler( + data_source=eval_dataset, + mini_repeat_count=self.num_generations, + seed=self.args.seed, + ) + + @profiling_decorator + def _get_last_hidden_state( + self, + unwrapped_model, + input_ids, + attention_mask, + logits_to_keep, + pixel_values=None, + image_grid_thw=None, + pixel_attention_mask=None, + image_sizes=None, + ): + if is_peft_model(unwrapped_model): + unwrapped_model = unwrapped_model.base_model.model + + # Build model inputs - check if the model supports logits_to_keep (some models and VLMs don't) + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask} + + # For Qwen models: + if image_grid_thw is not None and pixel_values is not None: + model_inputs["image_grid_thw"] = image_grid_thw + # For Gemma, SmolVLM2, LLaVa-Next etc.: + if pixel_values is not None: + model_inputs["pixel_values"] = pixel_values + # For SmolVLM2 + if pixel_attention_mask is not None: + model_inputs["pixel_attention_mask"] = pixel_attention_mask + # For LLaVa-Next + if image_sizes is not None: + model_inputs["image_sizes"] = image_sizes + + # Only add logits_to_keep if the model supports it + if "logits_to_keep" in self.model_kwarg_keys: + # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded + model_inputs["logits_to_keep"] = logits_to_keep + 1 + + model_inputs["use_cache"] = False # only used in generation; set False to suppress warnings + + last_hidden_state = unwrapped_model.model(**model_inputs).last_hidden_state + # Exclude the last value: it corresponds to the next token pred + last_hidden_state = last_hidden_state[:, :-1, :] # (B, L-1, H) + # Only keep the last logits_to_keep. For model that support logits_to_keep, this is a no-op. + last_hidden_state = last_hidden_state[:, -logits_to_keep:, :] # (B, logits_to_keep, H) + return last_hidden_state + + def get_high_entropy_mask(self, entropies: torch.Tensor, mask: torch.Tensor, threshold: float) -> torch.Tensor: + """ + Returns a binary mask identifying tokens whose entropy exceeds a given quantile threshold. + + Args: + entropies (`torch.Tensor`): + Tensor of shape (batch_size, seq_len) with per-token entropy values. + mask (`torch.Tensor`): + Binary mask of the same shape as `entropies`, where `1` indicates valid tokens and `0` padding. + threshold (`float`): + Quantile threshold between `0.0` and `1.0` to select high-entropy tokens. + + Returns: + `torch.Tensor`: + Boolean mask of shape (batch_size, seq_len), where `True` indicates tokens with entropy >= threshold + and `False` otherwise. + """ + non_pad_entropies = entropies[mask.bool()].float() + if non_pad_entropies.numel() == 0: + return torch.zeros_like(entropies, dtype=torch.bool) + + # The shape of non_pad_entropies can be different on each gpu/device. + # this can cause the gather operation to hang. So we first gather the lengths + # of non_pad_entropies and pad them to the max length before doing a gather. 
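+        # That is: pad each rank's entropy vector to the global maximum length, gather across
+        # processes, then drop the padded positions again before taking the quantile.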
+ non_pad_entropies_seq_length = torch.tensor([non_pad_entropies.numel()], device=entropies.device) + max_non_pad_entropies_seq_length = self.accelerator.gather(non_pad_entropies_seq_length).max().item() + padding = torch.zeros( + max_non_pad_entropies_seq_length - non_pad_entropies.numel(), device=non_pad_entropies.device + ) + padded_entropies = torch.cat([non_pad_entropies, padding]) + padded_entropies_mask = torch.cat([torch.ones_like(non_pad_entropies), padding]) + all_padded_entropies = self.accelerator.gather(padded_entropies) + all_padded_entropies_mask = self.accelerator.gather(padded_entropies_mask) + # Filter out entropies corresponding to padding. + all_non_padded_entropies = all_padded_entropies[all_padded_entropies_mask.bool()] + entropy_threshold = torch.quantile(all_non_padded_entropies, threshold) + masked_entropies = entropies * mask.float() + entropy_mask = masked_entropies >= entropy_threshold + return entropy_mask & mask.bool() # ensure padding tokens are always masked out + + def _get_per_token_logps_and_entropies(self, model, input_ids, attention_mask, logits_to_keep, batch_size = None, + compute_entropy = False, compute_efficient = False, *args, **kwargs): + # if True: # os.environ.get('UNSLOTH_USE_NEW_MODEL', '0') == '0': + # return None, None # logps, entropies Unsloth efficient GRPO + if compute_efficient: + return None, None + else: + # Otherwise, calculate normally: + if not hasattr(self, '_autocast_dtype'): + self._autocast_dtype = torch.float16 if os.environ.get('ACCELERATE_MIXED_PRECISION', 'fp16') == 'fp16' else torch.bfloat16 + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': self._autocast_dtype = torch.float16 + + pixel_values, image_grid_thw = kwargs.get("pixel_values", None), kwargs.get("image_grid_thw", None) + pixel_attention_mask, image_sizes = kwargs.get('pixel_attention_mask',None), kwargs.get('image_sizes',None) + + os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "1" + + unwrapped_model = self.accelerator.unwrap_model(model, keep_fp32_wrapper=False) + + with torch.amp.autocast(device_type = 'cuda', dtype = self._autocast_dtype): + with torch.inference_mode(): + if pixel_values is None: + attention_mask = input_ids != self.processing_class.pad_token_id + attention_mask = attention_mask.to(attention_mask.dtype) + # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded + logits = unwrapped_model( + input_ids = input_ids, + attention_mask = attention_mask, + pixel_values = pixel_values, + image_grid_thw = image_grid_thw, + pixel_attention_mask = pixel_attention_mask, + image_sizes = image_sizes, + #logits_to_keep = logits_to_keep + 1, + ).logits + else: + logits = unwrapped_model( + input_ids = input_ids, + attention_mask = attention_mask, + pixel_values = pixel_values, + image_grid_thw = image_grid_thw, + pixel_attention_mask = pixel_attention_mask, + image_sizes = image_sizes, + logits_to_keep = logits_to_keep + 1, + ).logits + entropies = None + if compute_entropy: + from trl.trainer.utils import entropy_from_logits + entropies = entropy_from_logits(logits) + os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "0" + # logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred + return logits, entropies # logps, entropies + # input_ids = input_ids[:, -logits_to_keep:] + # For transformers<=4.48, logits_to_keep argument isn't supported, so here we drop logits ourselves. 
+            # See https://github.com/huggingface/trl/issues/2770
+            # logits = logits[:, -logits_to_keep:]
+            # return logits
+            # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details
+            # logits = logits / self.temperature
+            # logps = selective_log_softmax(logits, input_ids)
+
+            # row_indices, col_indices = torch.where(logps < -20)
+
+            # # Method 1: Check if tensors have elements
+            # if len(row_indices) > 0 and len(col_indices) > 0:
+            #     breakpoint()  # Breakpoint triggered here
+            #     print("Found high values!")
+            # return logps  # compute logprobs for the input tokens
+        pass
+
+    def _fix_param_name_to_vllm(self, name, extra_prefixes: Optional[list[str]] = None):
+        extra_prefixes = extra_prefixes or []
+        prefixes = ["_checkpoint_wrapped_module."] + extra_prefixes
+        for prefix in prefixes:
+            name = name.replace(prefix, "")
+        return name
+
+    def _sync_fsdp1_params_to_vllm(self, module: nn.Module, prefix: str = "", visited=None):
+        """Memory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM."""
+        # For FSDP1, we need to recurse into children and also use summon_full_params
+        if visited is None:
+            visited = set()
+        for child_name, child_module in module.named_children():
+            child_prefix = f"{prefix}.{child_name}" if prefix else child_name
+            self._sync_fsdp1_params_to_vllm(
+                child_module, prefix=child_prefix, visited=visited
+            )  # recurse into the child
+
+        if isinstance(module, FSDP):
+            with FSDP.summon_full_params(module, recurse=False, writeback=False):
+                for param_name, param in module.named_parameters():
+                    full_name = f"{prefix}.{param_name}" if prefix else param_name
+                    full_name = self._fix_param_name_to_vllm(full_name, extra_prefixes=["_fsdp_wrapped_module."])
+
+                    if full_name in visited:
+                        continue  # skip FSDP subtrees already traversed
+                    visited.add(full_name)
+
+                    if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                        self.vllm_client.update_named_param(full_name, param.data)
+                    elif self.vllm_mode == "colocate":
+                        pass
+        pass
+
+    def _sync_fsdp2_params_to_vllm(self, module: nn.Module):
+        # For FSDP2, module.state_dict() already covers all parameters, so no need for recursion
+        for name, param in module.state_dict().items():
+            if param.is_cpu:
+                param = param.to(torch.device("cuda"))
+            param = param.full_tensor()
+
+            if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                self.vllm_client.update_named_param(name, param)
+            elif self.vllm_mode == "colocate":
+                pass
+
+    def _move_model_to_vllm(self, *args, **kwargs): return None
+
+    @profiling_decorator
+    def _prepare_inputs(
+        self, generation_batch: dict[str, Union[torch.Tensor, Any]]
+    ) -> dict[str, Union[torch.Tensor, Any]]:
+        # Prepares inputs for model training/evaluation by managing completion generation and batch handling.
+ # During training: + # - Receives the local generation batch (Per-GPU batch size × steps per generation) + # from the modified training dataloader instead of the standard local batch + # - Generates completions once for the entire generation batch and splits it into batches of size + # `per_device_train_batch_size` + # - Buffers these completions and returns the appropriate slice for the current accumulation step + # - Optimizes by regenerating completions only periodically (every steps_per_generation * num_iterations) + # During evaluation: + # - The input is treated as a standard local batch (no accumulation, no multiple iterations) + # - Completions are generated for each batch without buffering or reuse + # Returns a single local batch in both cases. + + mode = "train" if self.model.training else "eval" + if mode == "train": + generate_every = self.args.steps_per_generation * self.num_iterations + if self._step % generate_every == 0 or self._buffered_inputs is None: + # self._buffered_inputs=None can occur when resuming from a checkpoint + generation_batch = self._generate_and_score_completions(generation_batch) + generation_batch = split_pixel_values_by_grid(generation_batch) + + try: generation_batch = shuffle_sequence_dict(generation_batch) + + except: pass + generation_batches = split_tensor_dict(generation_batch, self.args.steps_per_generation) + self._buffered_inputs = [unsplit_pixel_values_by_grid(batch) for batch in generation_batches] + inputs = self._buffered_inputs[self._step % self.args.steps_per_generation] + self._step += 1 + else: + # In evaluation, there is neither batch grouping for generation, nor multiple iterations, hence + # local generation batch == local eval batch + inputs = self._generate_and_score_completions(generation_batch) + return inputs + + @profiling_decorator + def _calculate_rewards(self, inputs, prompts, completions, completion_ids_list): + device = self.accelerator.device + rewards_per_func = torch.zeros(len(prompts), len(self.reward_funcs), device=device) + + # Repeat all input columns (but "prompt", "completion", and "completion_ids") to match the num of generations + keys = [key for key in inputs[0] if key not in ["prompt", "completion", "completion_ids"]] + reward_kwargs = {key: [example[key] for example in inputs] for key in keys} + + # This allows for dynamic reward shaping based on training progress. 
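+        # For instance, a (hypothetical) custom reward function could anneal a length penalty
+        # over training:
+        #
+        #     def length_penalty(completions, trainer_state=None, **kwargs):
+        #         frac = trainer_state.global_step / max(trainer_state.max_steps, 1)
+        #         return [-frac * len(completion) for completion in completions]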
+ reward_kwargs["trainer_state"] = self.state + + for i, (reward_func, reward_processing_class, reward_func_name) in enumerate( + zip(self.reward_funcs, self.reward_processing_classes, self.reward_func_names) + ): + with profiling_context(self, reward_func_name): + if isinstance(reward_func, nn.Module): # Module (no PretrainedModel) for compat with compiled models + if is_conversational(inputs[0]): + messages = [{"messages": p + c} for p, c in zip(prompts, completions)] + texts = [apply_chat_template(x, reward_processing_class)["text"] for x in messages] + else: + texts = [p + c for p, c in zip(prompts, completions)] + reward_inputs = reward_processing_class( + text=texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False + ) + reward_inputs = super()._prepare_inputs(reward_inputs) + with torch.inference_mode(): + rewards_per_func[:, i] = reward_func(**reward_inputs).logits[:, 0] # Shape (B*G,) + else: + output_reward_func = reward_func( + prompts=prompts, completions=completions, completion_ids=completion_ids_list, **reward_kwargs + ) + # Convert None values to NaN + output_reward_func = [reward if reward is not None else torch.nan for reward in output_reward_func] + + rewards_per_func[:, i] = torch.tensor(output_reward_func, dtype=torch.float32, device=device) + + # If all reward functions return None for a given row, issue a detailed warning + if torch.isnan(rewards_per_func).all(dim=1).any(): + nan_row_idx = torch.isnan(rewards_per_func).all(dim=1).nonzero(as_tuple=True)[0][0] + row_reward_kwargs = { + key: value[nan_row_idx] for key, value in reward_kwargs.items() if key != "trainer_state" + } + row_reward_kwargs["prompt"] = prompts[nan_row_idx] + row_reward_kwargs["completion"] = completions[nan_row_idx] + logger.warning( + f"All reward functions returned None for the following kwargs:\n{row_reward_kwargs}\n" + "Please ensure that at least one reward function returns a valid reward." + ) + + # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the + # completions may be distributed across processes + rewards_per_func = gather(rewards_per_func) + return rewards_per_func + + def _generate_and_score_completions( + self, inputs: list[dict[str, Union[torch.Tensor, Any]]] + ) -> dict[str, Union[torch.Tensor, Any]]: + device = self.accelerator.device + mode = "train" if self.model.training else "eval" + + prompts = [x["prompt"] for x in inputs] + + # We don't yet support visual reward models/function, so we keep a copy of the original text-only prompts for + # later use in the reward computation. If images are present, we insert {"type": "image"} as required by the + # VLM chat template. 
+ original_prompts = copy.deepcopy(prompts) + + # If the prompts are conversational and the inputs contain images, we need to convert the prompts from + # [{"role": "user", "content": "What color is the sky?"}] to + # [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What color is the sky?"}]}] + kwargs = {} + has_images = "image" in inputs[0] + if has_images: + images = [example.get("image") for example in inputs] + kwargs = {"images": [[img] for img in images]} + for prompt in prompts: + if isinstance(prompt, list): # i.e., when using conversational data + prepare_multimodal_messages(prompt, num_images=1) + + + _chat_template_ = getattr(self.processing_class, "chat_template", None) + if _chat_template_ is None: _chat_template_ = "" + _supported_keys_ = set(("prompt", "chosen", "rejected", "completion", "messages", "label")) + + prompts_text = [] + for _example_ in inputs: + _tokenizer_kwargs_ = {} + if type(_example_) is not dict: + _example_ = {"prompt": _example_} + _left_keys_ = _example_.keys() - _supported_keys_ + for k in _left_keys_: + if k in _chat_template_: + v = _example_[k] + if type(v) is str: + _tokenizer_kwargs_[k] = v + _x_ = maybe_apply_chat_template(_example_, self.processing_class, **_tokenizer_kwargs_)["prompt"] + prompts_text.append(_x_) + prompt_inputs = self.processing_class( + text=prompts_text, + return_tensors="pt", + padding=True, + padding_side="left", + add_special_tokens=False, + **kwargs, + ) + prompt_inputs = super()._prepare_inputs(prompt_inputs) + prompt_ids, prompt_mask = prompt_inputs["input_ids"], prompt_inputs["attention_mask"] + if self.max_prompt_length is not None: + # If max_prompt_length is set, we trim the prompt to keep only the last `max_prompt_length` tokens. + # Then we decode those tokens back into text. We manually remove leading pad tokens from the decoded text, + # because we can't use `skip_special_tokens=True` (some special tokens are still needed for generation). + protected = [self.image_token_id, self.vision_start_token_id, self.vision_end_token_id] + protected = [token for token in protected if token is not None] + prompt_ids, prompt_mask = truncate_with_protected_tokens( + prompt_ids, prompt_mask, self.max_prompt_length, protected + ) + + prompts_text = [re.sub(rf"^({re.escape(self.pad_token)})+", "", text) for text in prompts_text] + + # The chat template inserts a single image token into the prompt text. However, when this text is later + # tokenized, the single image token string is expanded into multiple image token IDs, depending on the + # image size. Since we're detokenizing here, we may see repeated image tokens in the decoded text. We + # collapse them back into a single token string to match the original template. 
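+        # e.g. a decoded prompt like "<image><image><image>Describe this" collapses back to
+        # "<image>Describe this" (token string shown for illustration; the actual string is model-specific).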
+ if self.image_token is not None: + prompts_text = [ + re.sub(rf"({re.escape(self.image_token)})+", self.image_token, text) for text in prompts_text + ] + # Generate completions using either vLLM or regular generation + if self.use_vllm: + if self.vllm_mode == "colocate" and self.args.vllm_enable_sleep_mode: + # wake up colocated vLLM instances if needed + torch.cuda.empty_cache() # required to avoid OOM in some cases + self.llm.wake_up() + + # First, update the vLLM weights if needed + if self.state.global_step != self._last_loaded_step: + self._move_model_to_vllm() + self._last_loaded_step = self.state.global_step + + # Generate completions using vLLM: gather all prompts and use them in a single call in the main process + if self.vllm_mode == "server": + all_prompts_text = gather_object(prompts_text) + if has_images: + all_images = gather_object(images) + + if self.accelerator.is_main_process: + # Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and generate + # num_generations outputs for each one. This is faster than generating outputs for each duplicate + # prompt individually. + ordered_set_of_prompts = all_prompts_text[:: self.num_generations] + + if has_images: + ordered_set_of_images = all_images[:: self.num_generations] + else: + ordered_set_of_images = None + + with profiling_context(self, "vLLM.generate"): + output = self.vllm_client.generate( + prompts=ordered_set_of_prompts, + images=ordered_set_of_images, + n=self.num_generations, + repetition_penalty=self.repetition_penalty, + temperature=self.temperature, + top_p=self.top_p, + top_k=-1 if self.top_k is None else self.top_k, + min_p=0.0 if self.min_p is None else self.min_p, + max_tokens=self.max_completion_length, + guided_decoding_regex=self.guided_decoding_regex, + generation_kwargs=self.args.generation_kwargs, + ) + payload = (output["completion_ids"], output["logprobs"]) + else: + payload = None + + # Broadcast the completions from the main process to all processes, ensuring each process receives its corresponding slice. + obj_list = [payload] + broadcast_object_list(obj_list, from_process=0) + completion_ids, all_logprobs = obj_list[0] + + process_slice = slice( + self.accelerator.process_index * len(prompts), + (self.accelerator.process_index + 1) * len(prompts), + ) + completion_ids = completion_ids[process_slice] + all_logprobs = all_logprobs[process_slice] + + # Generate completions using colocated vLLM instances: each device holds vLLM copy and work on their own batch of prompts + elif self.vllm_mode == "colocate": + if self.guided_decoding_regex: + guided_decoding = GuidedDecodingParams(regex=self.guided_decoding_regex) + else: + guided_decoding = None + + generation_kwargs = { + "n": 1, # vLLM on each GPU generates only 1 in colocate mode + "repetition_penalty": self.repetition_penalty, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": -1 if self.top_k is None else self.top_k, + "min_p": 0.0 if self.min_p is None else self.min_p, + "max_tokens": self.max_completion_length, + "guided_decoding": guided_decoding, + "logprobs": 0, # only return the logprob of the generated token + } + if self.args.generation_kwargs is not None: + generation_kwargs.update(self.args.generation_kwargs) + sampling_params = SamplingParams(**generation_kwargs) + + if self.vllm_tensor_parallel_size > 1: + # Gather prompts from all ranks in the TP group and flatten. + # Each rank starts with its own prompts; after gathering, all ranks see the full group set. 
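+                    # e.g. with `vllm_tensor_parallel_size=2`: rank 0 holds [p0, p1] and rank 1 holds
+                    # [p2, p3]; after `all_gather_object` both ranks see [p0, p1, p2, p3].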
+ orig_size = len(prompts_text) + gathered_prompts = [None for _ in range(self.vllm_tensor_parallel_size)] + torch.distributed.all_gather_object(gathered_prompts, prompts_text, group=self.tp_group) + all_prompts_text = [p for sublist in gathered_prompts for p in sublist] + + if has_images: + gathered_images = [None for _ in range(self.vllm_tensor_parallel_size)] + torch.distributed.all_gather_object(gathered_images, images, group=self.tp_group) + all_images = [img for sublist in gathered_images for img in sublist] + else: + all_images = None + else: + all_prompts_text = prompts_text + all_images = images if has_images else None + + if has_images and all_images: + vllm_inputs = [] + for prompt, image in zip(all_prompts_text, all_images): + if image is not None: + vllm_inputs.append({"prompt": prompt, "multi_modal_data": {"image": image}}) + else: + vllm_inputs.append(prompt) + else: + vllm_inputs = all_prompts_text + + with profiling_context(self, "vLLM.generate"): + all_outputs = self.llm.generate(vllm_inputs, sampling_params=sampling_params, use_tqdm=False, lora_request = self.model.load_lora('grpo_trainer_lora_model', load_tensors = True)) + + completion_ids = [output.token_ids for outputs in all_outputs for output in outputs.outputs] + from trl.scripts.vllm_serve import sanitize_logprob + all_logprobs = [ + [sanitize_logprob(next(iter(logprob.values()))) for logprob in output.logprobs] + for outputs in all_outputs + for output in outputs.outputs + ] + + if self.vllm_tensor_parallel_size > 1: + # Slice completions for this rank within its TP group. + # Each rank generates all outputs — we keep only our share. + local_rank_in_group = torch.distributed.get_rank(group=self.tp_group) + tp_slice = slice(local_rank_in_group * orig_size, (local_rank_in_group + 1) * orig_size) + completion_ids = completion_ids[tp_slice] + all_logprobs = all_logprobs[tp_slice] + + if self.args.vllm_enable_sleep_mode: + self.llm.sleep(level=1) + + # Pad the completions, and concatenate them with the prompts + completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids] + completion_ids = pad(completion_ids, padding_value=self.pad_token_id) + prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1) + sampling_per_token_logps = [ + torch.tensor(logprobs, device=device, dtype=torch.float32) for logprobs in all_logprobs + ] + sampling_per_token_logps = pad(sampling_per_token_logps, padding_value=0.0) + + elif self.use_transformers_paged: + # Re-process inputs for paged generation if needed + # Note: images are already validated and preprocessed above + paged_prompt_inputs = self.processing_class(text=prompts_text, **kwargs) + previous_attn = self.model_wrapped.config._attn_implementation + + if is_flash_attn_2_available(): + self.model_wrapped.config._attn_implementation = "paged_attention" + else: + self.model_wrapped.config._attn_implementation = "sdpa_paged" + with ( + profiling_context(self, "transformers.generate_batch"), + unwrap_model_for_generation( + self.model_wrapped, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model, + torch.no_grad(), + FSDP.summon_full_params(self.model_wrapped, recurse=False) if self.is_fsdp_enabled else nullcontext(), + ): + # Cast to the appropriate dtype based on training configuration + if self.args.bf16: + unwrapped_model.to(torch.bfloat16) + elif self.args.fp16: + unwrapped_model.to(torch.float16) + with torch.inference_mode(): + all_outputs = unwrapped_model.generate_batch( + 
paged_prompt_inputs.input_ids, generation_config=self.generation_config, progress_bar=False
+                    )
+            completion_ids = [output.generated_tokens for output in all_outputs.values()]
+            completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
+            completion_ids = pad(completion_ids, padding_value=self.pad_token_id, padding_side="right")
+            prompt_ids = [torch.tensor(ids, device=device) for ids in paged_prompt_inputs.input_ids]
+            prompt_ids = pad(prompt_ids, padding_value=self.pad_token_id, padding_side="left")
+            prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+            # Restore the original attention implementation
+            self.model_wrapped.config._attn_implementation = previous_attn
+        else:
+            # Regular generation path
+            with (
+                profiling_context(self, "transformers.generate"),
+                unwrap_model_for_generation(
+                    self.model_wrapped, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation
+                ) as unwrapped_model,
+                torch.no_grad(),
+                FSDP.summon_full_params(self.model_wrapped, recurse=False) if self.is_fsdp_enabled else nullcontext(),
+            ):
+                prompt_inputs["input_ids"], prompt_inputs["attention_mask"] = prompt_ids, prompt_mask
+                prompt_completion_ids = unwrapped_model.generate(
+                    **prompt_inputs, generation_config=self.generation_config, disable_compile=True
+                )
+        # Compute prompt length and extract completion ids
+        prompt_length = prompt_ids.size(1)
+        prompt_ids = prompt_completion_ids[:, :prompt_length]
+        completion_ids = prompt_completion_ids[:, prompt_length:]
+
+        # Mask everything after the first EOS token
+        is_eos = completion_ids == self.eos_token_id
+        eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device)
+        eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
+        sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1)
+        completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
+
+        # Convert tensor to a list of lists of token IDs. This will be passed to the reward function, avoiding the need
+        # to re-tokenize completions if the reward is computed from tokens.
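+        # Illustrative example: a completion row [1212, 8, eos, pad, pad] with mask [1, 1, 1, 0, 0]
+        # becomes the plain Python list [1212, 8, eos] (the EOS token itself is kept by the mask).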
+        completion_ids_list = [row[mask_row].tolist() for row, mask_row in zip(completion_ids, completion_mask.bool())]
+
+        # Sum along sequence dimension (dim=1) to get completion length per sequence, used for logging
+        completion_lengths = completion_mask.sum(1)
+        agg_completion_lengths = self.accelerator.gather(completion_lengths)
+        num_items_in_batch = agg_completion_lengths.sum()  # this is required for the DAPO loss
+
+        # If mask_truncated_completions is enabled, zero out truncated completions in completion_mask
+        if self.mask_truncated_completions:
+            truncated_completions = ~is_eos.any(dim=1)
+            completion_mask = completion_mask * (~truncated_completions).unsqueeze(1).int()
+
+        # Concatenate prompt_mask with completion_mask for logit computation
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)  # (B, P+C)
+
+        logits_to_keep = completion_ids.size(1)  # we only need to compute the logits for the completion tokens
+
+        batch_size = self.args.per_device_train_batch_size if mode == "train" else self.args.per_device_eval_batch_size
+        try:
+            # TRL 0.23.1 and below path
+            if not has_images:
+                # Left-pack the ids (move padding to the right) before computing old and ref hidden states
+                prompt_completion_ids = left_pack_padding(prompt_completion_ids, self.processing_class.pad_token_id)
+            self.model.for_training()
+        except Exception:
+            # TRL 0.24.0 and above path
+            if images is None:
+                # Left-pack the ids (move padding to the right) before computing old and ref hidden states
+                prompt_completion_ids = left_pack_padding(prompt_completion_ids, self.processing_class.pad_token_id)
+            self.model.for_training()
+
+        with torch.no_grad():
+            # If the generation and optimization steps are misaligned—i.e., if generation does not occur at the end of
+            # a full optimizer step (when gradient_accumulation_steps is not a multiple of generate_every)—then the
+            # samples may come from an earlier version of the model. In that case, we need to track old_per_token_logps
+            # for importance sampling. If the steps are aligned, importance sampling isn't necessary and we set
+            # old_per_token_logps to None.
+            # When using vLLM, we always compute old_per_token_logps for importance sampling; it has been shown that the
+            # distribution mismatch between vLLM and the training model can be large and harm the training.
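+            # Rough sketch of the correction computed below (not the exact code):
+            #   ratio_t = clamp(exp(old_per_token_logps_t - sampling_per_token_logps_t),
+            #                   max=self.vllm_importance_sampling_cap)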
+ generate_every = self.args.steps_per_generation * self.num_iterations # generation frequency + + if self.args.gradient_accumulation_steps % generate_every != 0 or ( + self.use_vllm + ): + old_per_token_logps, _ = self._get_per_token_logps_and_entropies( + self.model, + prompt_completion_ids, + attention_mask, + logits_to_keep, + batch_size, + pixel_values=prompt_inputs.get("pixel_values"), + image_grid_thw=prompt_inputs.get("image_grid_thw"), + pixel_attention_mask=prompt_inputs.get("pixel_attention_mask"), + image_sizes=prompt_inputs.get("image_sizes"), + ) + else: + old_per_token_logps = None + + # Compute the importance sampling ratio when using vLLM, to correct for potential distribution mismatch + if self.use_vllm and self.vllm_importance_sampling_correction: + importance_sampling_ratio = torch.exp(old_per_token_logps - sampling_per_token_logps) + importance_sampling_ratio = torch.clamp( + importance_sampling_ratio, max=self.vllm_importance_sampling_cap + ) + + # Compute the per-token log probabilities for the reference model + if self.beta != 0.0: + if self.ref_model is not None: + ref_per_token_logps, _ = self._get_per_token_logps_and_entropies( + self.ref_model, + prompt_completion_ids, + attention_mask, + logits_to_keep, + batch_size=batch_size, + pixel_values=prompt_inputs.get("pixel_values"), + image_grid_thw=prompt_inputs.get("image_grid_thw"), + pixel_attention_mask=prompt_inputs.get("pixel_attention_mask"), + image_sizes=prompt_inputs.get("image_sizes"), + ) + else: + with self.accelerator.unwrap_model(self.model).disable_adapter(): + ref_per_token_logps, _ = self._get_per_token_logps_and_entropies( + self.model, + prompt_completion_ids, + attention_mask, + logits_to_keep, + batch_size=batch_size, + pixel_values=prompt_inputs.get("pixel_values"), + image_grid_thw=prompt_inputs.get("image_grid_thw"), + pixel_attention_mask=prompt_inputs.get("pixel_attention_mask"), + image_sizes=prompt_inputs.get("image_sizes"), + ) + else: + ref_per_token_logps = None + + # Decode the generated completions + completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True) + if is_conversational(inputs[0]): + completions = [] + for prompt, completion in zip(prompts, completions_text): + bootstrap = prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else "" + completions.append([{"role": "assistant", "content": bootstrap + completion}]) + else: + completions = completions_text + + # Calculate rewards for each reward function. rewards_per_func aggregates rewards across all processes. This is + # important because rewards will be normalized per group, and completions are distributed. We will later slice + # rewards_per_func to extract each process's subset. 
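+        # Shape note (illustrative): with N processes, B completions per process and R reward
+        # functions, rewards_per_func is gathered to shape (N*B, R); the advantages derived from
+        # it are sliced back down to this process's B rows further below.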
+        rewards_per_func = self._calculate_rewards(inputs, original_prompts, completions, completion_ids_list)
+
+        # Apply weights to each reward function's output and sum
+        rewards = (rewards_per_func * self.reward_weights.to(device).unsqueeze(0)).nansum(dim=1)
+
+        # Compute group-wise rewards
+        mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
+
+        # Normalize the rewards to compute the advantages
+        mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
+        advantages = rewards - mean_grouped_rewards
+
+        if self.scale_rewards in ["group", "none"]:
+            # If self.scale_rewards == "none", we'll still log the group-level std
+            std_rewards = rewards.view(-1, self.num_generations).std(dim=1)
+            std_rewards = std_rewards.repeat_interleave(self.num_generations, dim=0)
+        elif self.scale_rewards == "batch":
+            # Compute global std
+            std_rewards = rewards.std().expand_as(rewards)
+        else:
+            raise ValueError(
+                f"Invalid value for scale_rewards: {self.scale_rewards}. Must be one of 'batch', 'group', or 'none'."
+            )
+
+        is_std_zero = torch.isclose(std_rewards, torch.zeros_like(std_rewards))
+        if self.scale_rewards != "none":
+            advantages = advantages / (std_rewards + 1e-4)
+
+        # Slice to keep only the local part of the data
+        process_slice = slice(
+            self.accelerator.process_index * len(prompts),
+            (self.accelerator.process_index + 1) * len(prompts),
+        )
+        all_process_advantages = advantages.clone()  # keep the aggregated advantages for logging
+        advantages = advantages[process_slice]
+
+        # Log the metrics
+        if mode == "train":
+            self.state.num_input_tokens_seen += self.accelerator.gather(attention_mask.sum()).sum().item()
+            self._metrics[mode]["num_tokens"] = [self.state.num_input_tokens_seen]
+
+        # Log completion lengths, mean, min, max
+        self._metrics[mode]["completions/mean_length"].append(agg_completion_lengths.float().mean().item())
+        self._metrics[mode]["completions/min_length"].append(agg_completion_lengths.float().min().item())
+        self._metrics[mode]["completions/max_length"].append(agg_completion_lengths.float().max().item())
+
+        # Identify sequences that terminated with EOS and log their lengths
+        agg_terminated_with_eos = self.accelerator.gather(is_eos.any(dim=1))
+        term_completion_lengths = agg_completion_lengths[agg_terminated_with_eos]
+        clipped_completions_ratio = 1 - len(term_completion_lengths) / len(agg_completion_lengths)
+        self._metrics[mode]["completions/clipped_ratio"].append(clipped_completions_ratio)
+        if len(term_completion_lengths) == 0:  # edge case where no terminated sequences are found
+            term_completion_lengths = torch.zeros(1, device=device)
+        self._metrics[mode]["completions/mean_terminated_length"].append(term_completion_lengths.float().mean().item())
+        self._metrics[mode]["completions/min_terminated_length"].append(term_completion_lengths.float().min().item())
+        self._metrics[mode]["completions/max_terminated_length"].append(term_completion_lengths.float().max().item())
+
+        # Calculate mean reward per function, but only for samples where the function was applied (non-NaN values)
+        for i, reward_func_name in enumerate(self.reward_func_names):
+            mean_rewards = torch.nanmean(rewards_per_func[:, i]).item()
+            self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards)
+            std_func_rewards = nanstd(rewards_per_func[:, i]).item()
+            self._metrics[mode][f"rewards/{reward_func_name}/std"].append(std_func_rewards)
+        self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item())
+
self._metrics[mode]["reward_std"].append(std_rewards.mean().item()) + self._metrics[mode]["frac_reward_zero_std"].append(is_std_zero.float().mean().item()) + + # Log prompt and completion texts + self._logs["prompt"].extend(gather_object(prompts_text)) + self._logs["completion"].extend(gather_object(completions_text)) + for i, name in enumerate(self.reward_func_names): + self._logs["rewards"][name].extend(rewards_per_func[:, i].tolist()) + self._logs["advantages"].extend(all_process_advantages.tolist()) + + if has_images: + self._logs["image"].extend(gather_object(images)) + + if self.use_vllm and self.vllm_importance_sampling_correction: + delta = torch.abs(old_per_token_logps - sampling_per_token_logps) + delta = delta[completion_mask.bool()] + mean_delta = torch.mean(delta) if delta.numel() > 0 else torch.tensor(0.0, device=device) + max_delta = torch.max(delta) if delta.numel() > 0 else torch.tensor(0.0, device=device) + self._metrics[mode]["sampling/sampling_logp_difference/mean"].append( + self.accelerator.gather(mean_delta).mean().item() + ) + self._metrics[mode]["sampling/sampling_logp_difference/max"].append( + self.accelerator.gather(max_delta).max().item() + ) + + flat_is_ratio = importance_sampling_ratio[completion_mask.bool()] + min_importance_sampling_ratio = ( + torch.min(flat_is_ratio) if flat_is_ratio.numel() > 0 else torch.tensor(0.0, device=device) + ) + mean_importance_sampling_ratio = ( + torch.mean(flat_is_ratio) if flat_is_ratio.numel() > 0 else torch.tensor(0.0, device=device) + ) + max_importance_sampling_ratio = ( + torch.max(flat_is_ratio) if flat_is_ratio.numel() > 0 else torch.tensor(0.0, device=device) + ) + self._metrics[mode]["sampling/importance_sampling_ratio/min"].append( + nanmin(self.accelerator.gather(min_importance_sampling_ratio)).item() + ) + self._metrics[mode]["sampling/importance_sampling_ratio/mean"].append( + self.accelerator.gather(mean_importance_sampling_ratio).nanmean().item() + ) + self._metrics[mode]["sampling/importance_sampling_ratio/max"].append( + nanmax(self.accelerator.gather(max_importance_sampling_ratio)).item() + ) + + output = { + "prompt_ids": prompt_ids, + "prompt_mask": prompt_mask, + "completion_ids": completion_ids, + "completion_mask": completion_mask, + "advantages": advantages, + "num_items_in_batch": num_items_in_batch, + } + if old_per_token_logps is not None: + output["old_per_token_logps"] = old_per_token_logps + if self.use_vllm and self.vllm_importance_sampling_correction: + output["importance_sampling_ratio"] = importance_sampling_ratio + if ref_per_token_logps is not None: + output["ref_per_token_logps"] = ref_per_token_logps + if "pixel_values" in prompt_inputs: + output["pixel_values"] = prompt_inputs["pixel_values"] + if "image_grid_thw" in prompt_inputs: + output["image_grid_thw"] = prompt_inputs["image_grid_thw"] + if "pixel_attention_mask" in prompt_inputs: + output["pixel_attention_mask"] = prompt_inputs["pixel_attention_mask"] + if "image_sizes" in prompt_inputs: + output["image_sizes"] = prompt_inputs["image_sizes"] + + if self.use_vllm: + try: + output["sampling_per_token_logps"] = sampling_per_token_logps + except NameError: + output["sampling_per_token_logps"] = None + return output + + def compute_liger_loss(self, unwrapped_model, inputs): + # Compute the per-token log probabilities for the model + prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"] + completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"] + input_ids = torch.cat([prompt_ids, 
completion_ids], dim=1) + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) + logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens + + # Get the last hidden state of the model + last_hidden_state = self._get_last_hidden_state( + unwrapped_model, + input_ids, + attention_mask, + logits_to_keep, + inputs.get("pixel_values"), + inputs.get("image_grid_thw"), + inputs.get("pixel_attention_mask"), + inputs.get("image_sizes"), + ) + + # compute loss and metrics using liger grpo loss + loss, metrics = self.liger_grpo_loss( + _input=last_hidden_state, + lin_weight=unwrapped_model.lm_head.weight, + selected_token_ids=completion_ids, + attention_mask=completion_mask, + advantages=inputs["advantages"], + bias=unwrapped_model.lm_head.bias, + old_per_token_logps=inputs.get("old_per_token_logps"), + ref_per_token_logps=inputs.get("ref_per_token_logps"), + ) + # Extract metrics from the liger_grpo_loss output + # KL divergence is the first metric when beta is non-zero + mean_kl = metrics[0] if self.beta != 0.0 else None + clip_ratio = metrics[-1] + + mode = "train" if self.model.training else "eval" + if self.beta != 0.0: + self._metrics[mode]["kl"].append(self.accelerator.gather(mean_kl).mean().item()) + self._metrics[mode]["clip_ratio"].append(self.accelerator.gather(clip_ratio).mean().item()) + return loss / self.current_gradient_accumulation_steps + + def compute_loss(self, model, inputs, return_outputs = False, num_items_in_batch = None): + if return_outputs: + raise ValueError("The GRPOTrainer does not support returning outputs") + # Compute the per-token log probabilities for the model + prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"] + completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"] + pixel_values, image_grid_thw = inputs.get("pixel_values", None), inputs.get("image_grid_thw", None) + pixel_attention_mask, image_sizes = inputs.get('pixel_attention_mask',None), inputs.get('image_sizes',None) + num_items_in_batch = inputs.get("num_items_in_batch", None) + sampling_per_token_logps = inputs.get("sampling_per_token_logps", None) + current_gradient_accumulation_steps = self.current_gradient_accumulation_steps + num_processes = self.accelerator.num_processes + + input_ids = torch.cat([prompt_ids, completion_ids], dim=1) + bsz, qlen = input_ids.shape + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) + # attention_mask = None + logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens + _input_ids = input_ids + _logits_to_keep = logits_to_keep + + get_logps_func = \ + lambda model, input_ids, attention_mask, logits_to_keep, batch_size=None, compute_entropy=False, compute_efficient = False: \ + self._get_per_token_logps(model, input_ids, attention_mask, logits_to_keep, compute_efficient) \ + if hasattr(self, "_get_per_token_logps") else \ + self._get_per_token_logps_and_entropies(model, input_ids, attention_mask, logits_to_keep, batch_size, compute_entropy, compute_efficient)[0] # logps + + per_token_logps = get_logps_func(model, input_ids, attention_mask, logits_to_keep, compute_efficient = True) + # Compute the KL divergence between the model and the reference model + # _prepare_inputs doesn't return reference log probs anymore. We need to calculate it ourselves. 
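+        # (When self.beta != 0.0 this would be the k3-style KL estimator,
+        #  exp(ref_logp - logp) - (ref_logp - logp) - 1 per token, as in the
+        #  commented-out reference snippet below.)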
+ # https://github.com/huggingface/trl/blob/05bc43e960396581e458195b8388efe6b82cae1f/trl/trainer/grpo_trainer.py#L1328 + # if self.beta != 0.0: + # with torch.inference_mode(), model.disable_adapter(): + # ref_per_token_logps = per_token_logps = get_logps_func(model, input_ids, attention_mask, logits_to_keep) + # else: + # ref_per_token_logps = None + ref_hidden_states = inputs.get("ref_per_token_logps", None) + # per_token_kl = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1 + # x - x.detach() allows for preserving gradients from x + advantages = inputs["advantages"] + # per_token_loss = torch.exp(per_token_logps - per_token_logps.detach()) * advantages.unsqueeze(1) + # per_token_loss = -(per_token_loss - self.beta * per_token_kl) + # loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean() + old_hidden_states = inputs.get("old_per_token_logps", None) + + input_ids = input_ids[:, -logits_to_keep:] + + # Get logit softcapping and logit scale + logit_softcapping = getattr(model.config, "final_logit_softcapping", 0) # Gemma + if logit_softcapping is None: logit_softcapping = 0 + logit_scale_multiply = getattr(model.config, "logit_scale", 0) # Cohere + if logit_scale_multiply is None: logit_scale_multiply = 0 + logit_scale_divide = getattr(model.config, "logits_scaling", 0) # Granite + if logit_scale_divide is None: logit_scale_divide = 0 + + if per_token_logps is not None: + + if ref_hidden_states is not None: + ref_hidden_states = ref_hidden_states[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred + if old_hidden_states is not None: + old_hidden_states = old_hidden_states[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred + per_token_logps = per_token_logps[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred + + loss, completion_length, mean_kl, delta, flat_is_ratio = grpo_compute_loss_slow( + ref_hidden_states, + per_token_logps, + old_hidden_states, + input_ids, + completion_mask, + self.beta, + advantages, + pixel_values = pixel_values, + image_grid_thw = image_grid_thw, + loss_type = self.args.loss_type, + importance_sampling_level = self.importance_sampling_level, + epsilon_low = self.epsilon_low, + epsilon_high = self.epsilon_high, + max_completion_length = self.args.max_completion_length, + delta = self.args.delta, + temperature = self.args.temperature, + logit_softcapping = logit_softcapping, + logit_scale_multiply = logit_scale_multiply, + logit_scale_divide = logit_scale_divide, + num_items_in_batch = num_items_in_batch, + current_gradient_accumulation_steps = current_gradient_accumulation_steps, + num_processes = num_processes, + sampling_per_token_logps = sampling_per_token_logps, + ) + else: + if hasattr(self.args, "loss_type"): + loss, completion_length, mean_kl, delta, flat_is_ratio = grpo_accumulated_loss( + trainer = self, + input_ids = _input_ids, + pixel_values = pixel_values, + image_grid_thw = image_grid_thw, + logits_to_keep = logits_to_keep, + completion_mask = completion_mask, + advantages = advantages, + old_hidden_states = old_hidden_states, + ref_hidden_states = ref_hidden_states, + n_chunks = self.args.unsloth_num_chunks, + loss_type = self.args.loss_type, + importance_sampling_level = self.importance_sampling_level, + epsilon_low = self.epsilon_low, + epsilon_high = self.epsilon_high, + max_completion_length = self.args.max_completion_length, + delta = self.args.delta, + 
temperature = self.args.temperature, + logit_softcapping = logit_softcapping, + logit_scale_multiply = logit_scale_multiply, + logit_scale_divide = logit_scale_divide, + attention_mask = attention_mask, + num_items_in_batch = num_items_in_batch, + current_gradient_accumulation_steps = current_gradient_accumulation_steps, + num_processes = num_processes, + sampling_per_token_logps = sampling_per_token_logps, + ) + else: + # to ensure backwards compatibility with trl 0.15.2 and maybe even 0.17 + loss, completion_length, mean_kl = grpo_accumulated_loss( + trainer = self, + input_ids = _input_ids, + logits_to_keep = logits_to_keep, + completion_mask = completion_mask, + advantages = advantages, + old_hidden_states = old_hidden_states, + ref_hidden_states = ref_hidden_states, + n_chunks = self.args.unsloth_num_chunks, + temperature = self.args.temperature, + logit_softcapping = logit_softcapping, + logit_scale_multiply = logit_scale_multiply, + logit_scale_divide = logit_scale_divide, + attention_mask = attention_mask, + ) + pass + pass + + if "train" in self._metrics: + mode = "eval" if self.control.should_evaluate else "train" + self._metrics[mode]["completion_length"].append(completion_length.item()) + self._metrics[mode]["kl"].append(mean_kl.item()) + else: + self._metrics["completion_length"].append(completion_length.item()) + self._metrics["kl"].append(mean_kl.item()) + + if self.use_vllm and delta is not None: + mean_delta = torch.mean(delta) if delta.numel() > 0 else torch.tensor(0.0, device=self.model.device) + max_delta = torch.max(delta) if delta.numel() > 0 else torch.tensor(0.0, device=self.model.device) + self._metrics[mode]["sampling/sampling_logp_difference/mean"].append( + self.accelerator.gather(mean_delta).mean().item() + ) + self._metrics[mode]["sampling/sampling_logp_difference/max"].append( + self.accelerator.gather(max_delta).max().item() + ) + + min_importance_sampling_ratio = ( + torch.min(flat_is_ratio) if flat_is_ratio.numel() > 0 else torch.tensor(0.0, device=self.model.device) + ) + mean_importance_sampling_ratio = ( + torch.mean(flat_is_ratio) if flat_is_ratio.numel() > 0 else torch.tensor(0.0, device=self.model.device) + ) + max_importance_sampling_ratio = ( + torch.max(flat_is_ratio) if flat_is_ratio.numel() > 0 else torch.tensor(0.0, device=self.model.device) + ) + self._metrics[mode]["sampling/importance_sampling_ratio/min"].append( + nanmin(self.accelerator.gather(min_importance_sampling_ratio)).item() + ) + self._metrics[mode]["sampling/importance_sampling_ratio/mean"].append( + self.accelerator.gather(mean_importance_sampling_ratio).nanmean().item() + ) + self._metrics[mode]["sampling/importance_sampling_ratio/max"].append( + nanmax(self.accelerator.gather(max_importance_sampling_ratio)).item() + ) + + return loss + + def _compute_loss(self, model, inputs): + # Compute the per-token log probabilities for the model + prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"] + completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"] + input_ids = torch.cat([prompt_ids, completion_ids], dim=1) + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) + logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens + + # Compute the per_token_logps and the entropy at each position in the completion + per_token_logps, entropies = self._get_per_token_logps_and_entropies( + model, + input_ids, + attention_mask, + logits_to_keep, + compute_entropy=True, + 
pixel_values=inputs.get("pixel_values"), + image_grid_thw=inputs.get("image_grid_thw"), + pixel_attention_mask=inputs.get("pixel_attention_mask"), + image_sizes=inputs.get("image_sizes"), + ) + + if self.top_entropy_quantile < 1.0: + entropy_mask = self.get_high_entropy_mask(entropies, completion_mask, 1 - self.top_entropy_quantile) + else: + entropy_mask = None + + # Compute the KL divergence between the model and the reference model + if self.beta != 0.0: + ref_per_token_logps = inputs["ref_per_token_logps"] + per_token_kl = ( + torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1 + ) + + # Compute the loss + advantages = inputs["advantages"] + # When num_iterations == 1 and steps_per_generation <= gradient_accumulation_steps, + # old_per_token_logps == per_token_logps. In this case we can skip its computation + # (see _generate_and_score_completions) and instead use per_token_logps.detach(). + # The exception is when using vLLM, where we always compute old_per_token_logps + # for importance sampling + old_per_token_logps = inputs.get("old_per_token_logps") + old_per_token_logps = per_token_logps.detach() if old_per_token_logps is None else old_per_token_logps + + log_ratio = per_token_logps - old_per_token_logps + if self.importance_sampling_level == "token": + log_importance_weights = log_ratio + elif self.importance_sampling_level == "sequence": + log_importance_weights = (log_ratio * completion_mask).sum(-1) / completion_mask.sum(-1).clamp(min=1.0) + log_importance_weights = log_importance_weights.unsqueeze(-1) + else: + raise ValueError( + f"Unknown importance sampling level: {self.importance_sampling_level}. Possible values are 'token' " + "and 'sequence'." + ) + # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) 
shape depends on + # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1) + + coef_1 = torch.exp(log_importance_weights) + coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high) + + # Two-sided clipping + if self.args.delta is not None: + coef_1 = torch.clamp(coef_1, max=self.args.delta) + + per_token_loss1 = coef_1 * advantages.unsqueeze(1) + per_token_loss2 = coef_2 * advantages.unsqueeze(1) + per_token_loss = -torch.min(per_token_loss1, per_token_loss2) + if entropy_mask is not None: + per_token_loss = per_token_loss * entropy_mask + + if self.use_vllm and self.vllm_importance_sampling_correction: + per_token_loss = per_token_loss * inputs["importance_sampling_ratio"] + + if self.beta != 0.0: + per_token_loss = per_token_loss + self.beta * per_token_kl + + if self.loss_type == "grpo": + loss = ((per_token_loss * completion_mask).sum(-1) / completion_mask.sum(-1).clamp(min=1.0)).mean() + loss = loss / self.current_gradient_accumulation_steps + elif self.loss_type == "bnpo": + loss = (per_token_loss * completion_mask).sum() / completion_mask.sum().clamp(min=1.0) + loss = loss / self.current_gradient_accumulation_steps + elif self.loss_type == "dr_grpo": + loss = (per_token_loss * completion_mask).sum() / (per_token_loss.size(0) * self.max_completion_length) + loss = loss / self.current_gradient_accumulation_steps + elif self.loss_type == "dapo": + normalizer = inputs["num_items_in_batch"] / self.accelerator.num_processes + loss = (per_token_loss * completion_mask).sum() / normalizer + else: + raise ValueError(f"Unknown loss type: {self.loss_type}") + + # Log the metrics + mode = "train" if self.model.training else "eval" + + completion_token_count = completion_mask.sum().clamp(min=1.0) + + def masked_batch_mean(x): + if x.shape[1] == 1: # when importance_sampling_level == "sequence" + return x.mean() + else: + return (x * completion_mask).sum() / completion_token_count + + if self.beta != 0.0: + mean_kl = masked_batch_mean(per_token_kl) + self._metrics[mode]["kl"].append(self.accelerator.gather(mean_kl).nanmean().item()) + + mean_entropy = masked_batch_mean(entropies) + self._metrics[mode]["entropy"].append(self.accelerator.gather(mean_entropy).nanmean().item()) + + # Compute the clipped probability ratios + is_low_clipped = (coef_1 < 1 - self.epsilon_low) & (advantages.unsqueeze(1) < 0) + is_high_clipped = (coef_1 > 1 + self.epsilon_high) & (advantages.unsqueeze(1) > 0) + is_region_clipped = is_low_clipped | is_high_clipped + + low_clip = masked_batch_mean(is_low_clipped.float()) + high_clip = masked_batch_mean(is_high_clipped.float()) + clip_ratio = masked_batch_mean(is_region_clipped.float()) + + gathered_low_clip = self.accelerator.gather(low_clip) + self._metrics[mode]["clip_ratio/low_mean"].append(gathered_low_clip.nanmean().item()) + self._metrics[mode]["clip_ratio/low_min"].append(nanmin(gathered_low_clip).item()) + gathered_high_clip = self.accelerator.gather(high_clip) + self._metrics[mode]["clip_ratio/high_mean"].append(gathered_high_clip.nanmean().item()) + self._metrics[mode]["clip_ratio/high_max"].append(nanmax(gathered_high_clip).item()) + gathered_clip_ratio = self.accelerator.gather(clip_ratio) + self._metrics[mode]["clip_ratio/region_mean"].append(gathered_clip_ratio.nanmean().item()) + return loss + + def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys: Optional[list[str]] = None): + inputs = self._prepare_inputs(inputs) + with torch.no_grad(): + with self.compute_loss_context_manager(): + loss = 
self.compute_loss(model, inputs) + loss = loss.mean().detach() + return loss, None, None + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + mode = "train" if self.model.training else "eval" + metrics = {key: sum(val) / len(val) for key, val in self._metrics[mode].items()} # average the metrics + + # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs` + # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format. + if mode == "eval": + metrics = {f"eval_{key}": val for key, val in metrics.items()} + + logs = {**logs, **metrics} + super().log(logs, start_time) + self._metrics[mode].clear() + + if self.accelerator.is_main_process and self.log_completions: + if is_rich_available(): + print_prompt_completions_sample( + self._logs["prompt"], + self._logs["completion"], + self._logs["rewards"], + self._logs["advantages"], + self.state.global_step, + self.num_completions_to_print, + ) + + if self.args.report_to and "wandb" in self.args.report_to and wandb.run is not None: + import pandas as pd + + table = { + "step": [str(self.state.global_step)] * len(self._logs["prompt"]), + "prompt": self._logs["prompt"], + "completion": self._logs["completion"], + **self._logs["rewards"], + "advantage": self._logs["advantages"], + } + + if self._logs["image"]: + table["image"] = [] + for img in self._logs["image"]: + if img is not None: + # Convert images to wandb Image objects for proper visualization + table["image"].append(wandb.Image(img)) + else: + table["image"].append(None) + + df = pd.DataFrame(table) + if self.wandb_log_unique_prompts: + df = df.drop_duplicates(subset=["prompt"]) + wandb.log({"completions": wandb.Table(dataframe=df)}) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + # docstyle-ignore + citation = textwrap.dedent( + """\ + @article{shao2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. 
Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, + } + """ + ) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="GRPO", + trainer_citation=citation, + paper_title="DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models", + paper_id="2402.03300", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothGRPOTrainer(_UnslothGRPOTrainer): + """ + +Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the +paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language +Models](https://huggingface.co/papers/2402.03300). + +Example: + +```python +from datasets import load_dataset +from trl import GRPOTrainer + +dataset = load_dataset("trl-lib/tldr", split="train") +def reward_func(completions, **kwargs): + # Dummy reward function that rewards completions with more unique letters. + return [float(len(set(completion))) for completion in completions] +trainer = GRPOTrainer( + model="Qwen/Qwen2-0.5B-Instruct", + reward_funcs=reward_func, + train_dataset=dataset, +) + +trainer.train() +``` + +Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in + `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. + reward_funcs (`Union[RewardFunc, list[RewardFunc]]`): + Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward + functions with the prompts and completions and sum the rewards. Can be either: + + - A single reward function, such as: + - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the + keyword arguments in `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported. + - A custom reward function: The function is provided with the prompts and the generated completions, + plus any additional columns in the dataset. It should return a list of rewards. Custom reward + functions can also return `None` when the reward is not applicable to those samples. This is useful + for multi-task training where different reward functions apply to different types of samples. When a + reward function returns `None` for a sample, that reward function is excluded from the reward + calculation for that sample. For more details, see [Using a custom reward + function](#using-a-custom-reward-function). + + The trainer's state is also passed to the reward function. 
The trainer's state is an instance of
+            [`~transformers.TrainerState`] and can be accessed via the `trainer_state` argument in the
+            reward function's signature.
+        - A list of reward functions, where each item can independently be any of the above types. Mixing different
+          types within the list (e.g., a string model ID and a custom reward function) is allowed.
+    args ([`GRPOConfig`], *optional*, defaults to `None`):
+        Configuration for this trainer. If `None`, a default configuration is used.
+    train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+        Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset are
+        ignored. The format of the samples can be either:
+
+        - [Standard](dataset_formats#standard): Each sample contains plain text.
+        - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+          and content).
+    eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
+        Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`] or `None`, *optional*, defaults to `None`):
+        Processing class used to process the data. The padding side must be set to "left". If `None`, the
+        processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A
+        padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token,
+        `tokenizer.eos_token` will be used as the default.
+    reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
+        Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
+
+        - A single processing class: Used when `reward_funcs` contains only one reward function.
+        - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
+          If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
+          `None`, the tokenizer for the model is automatically loaded using
+          [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward
+          functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes`
+          are ignored.
+    callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
+        List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
+        [here](https://huggingface.co/docs/transformers/main_classes/callback).
+
+        If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
+        method.
+    optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
+        A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
+        model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
+    peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
+        PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
+ + """ + def __init__( + self, + model, + reward_funcs, + args = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + reward_processing_classes = None, + callbacks = None, + peft_config = None, + **kwargs + ): + if args is None: args = UnslothGRPOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval 
and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + other_metrics = [] + if not isinstance(reward_funcs, list): _reward_funcs = [reward_funcs] + else: _reward_funcs = reward_funcs + for reward_func in _reward_funcs: + try: + reward_func_name = reward_func.__name__ + if True: + other_metrics.append(f'rewards/{reward_func_name}/mean') + if True: + other_metrics.append(f'rewards/{reward_func_name}/std') + if False: + other_metrics.append(f'rewards/{reward_func_name}') + except: pass + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('grpo_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? 
[TODO]
+        if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1:
+            if getattr(args, "_n_gpu", 1) != 1:
+                args._n_gpu = 1
+        if "model" in locals() and hasattr(model, "for_training"):
+            model.for_training()
+        super().__init__(
+            model = model,
+            reward_funcs = reward_funcs,
+            args = args,
+            train_dataset = train_dataset,
+            eval_dataset = eval_dataset,
+            processing_class = processing_class,
+            reward_processing_classes = reward_processing_classes,
+            callbacks = callbacks,
+            peft_config = peft_config,**kwargs)
+        if "model" in locals() and hasattr(model, "for_inference"):
+            model.for_inference()
+        if hasattr(self, 'neftune_hook_handle'):
+            self.neftune_hook_handle.remove()
+        if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle
+        if getattr(args, 'neftune_noise_alpha', None) is not None:
+            model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha
+        pass
+        if hasattr(self, 'accelerator'):
+            scaler = self.accelerator.scaler
+            current_model = model
+            while hasattr(current_model, 'model'):
+                current_model.accelerator_scaler = scaler
+                current_model = current_model.model
+            current_model.accelerator_scaler = scaler
+        pass
+        if hasattr(self, 'train'):
+            self.train = MethodType(prepare_for_training_mode(self.__class__.train), self)
+        pass
+
+pass
+
+
+if hasattr(logger, "addFilter"):
+    import logging
+    class HideLoggingMessage(logging.Filter):
+        def __init__(self, text): self.text = text
+        def filter(self, x): return not (self.text in x.getMessage())
+    pass
+    logger.addFilter(HideLoggingMessage("`use_cache=True`"))
+
diff --git a/unsloth_compiled_cache/UnslothIterativeSFTTrainer.py b/unsloth_compiled_cache/UnslothIterativeSFTTrainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..de27975bb36ea45104a2fcf9190ffdb5fae524c0
--- /dev/null
+++ b/unsloth_compiled_cache/UnslothIterativeSFTTrainer.py
@@ -0,0 +1,1149 @@
+"""
+2025.11.2
+2025.11.1
+4.57.2
+0.23.0
+__UNSLOTH_VERSIONING__
+"""
+
+# Unsloth auto generated code
+# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from torch import Tensor
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
+from trl.trainer.iterative_sft_trainer import (AutoModelForCausalLM, AutoTokenizer, BaseImageProcessor, Callable, DataCollator, DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, DataLoader, Dataset, EvalLoopOutput, FeatureExtractionMixin, IterativeSFTConfig, IterativeSFTTrainer, Optional, PPODecorators, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, Trainer, TrainingArguments, Union, generate_model_card, get_comet_experiment_url, is_peft_available, is_wandb_available, logger, logging, os, torch, warnings, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, logger, os, torch)
+
+
+import os
+from typing import *
+from dataclasses import dataclass, field
+from packaging.version import Version
+import torch
+import numpy as np
+from contextlib import nullcontext
+from torch.nn import functional as F
+from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling
+from transformers.training_args import ParallelMode
+
+# Wrap the trainer so padding goes to the right and training mode is enabled
+import functools
+from types import MethodType
+def prepare_for_training_mode(f):
+    @functools.wraps(f)
+    def wrapper(self, *args, **kwargs):
+        # Enable training mode
+        if hasattr(self, 'model') and hasattr(self.model, "for_training"):
+            self.model.for_training()
+        output = f(self, *args, **kwargs)
+        # Return to inference mode
+        if hasattr(self, 'model') and hasattr(self.model, "for_inference"):
+            self.model.for_inference()
+        return output
+    return wrapper
+pass
+
+torch_compile_options = {
+    "epilogue_fusion"   : True,
+    "max_autotune"      : False,
+    "shape_padding"     : True,
+    "trace.enabled"     : False,
+    "triton.cudagraphs" : False,
+}
+
+@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,)
+def chunked_selective_log_softmax(logits, index):
+    # Split into 4 chunks only
+    chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0)
+    chunked_index  = torch.chunk(index.reshape(-1), chunks = 4, dim = 0)
+    all_per_token_logps = []
+    # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index)
+    for chunk_logits, chunk_index in zip(chunked_logits, chunked_index):
+        chunk_logits = chunk_logits.to(torch.float32)
+        selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1)
+        logsumexp_values = torch.logsumexp(chunk_logits, dim = -1)
+        per_token_logps = selected_logits - logsumexp_values
+        all_per_token_logps.append(per_token_logps)
+    pass
+    all_per_token_logps = torch.concat(all_per_token_logps)
+    all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1]))
+    return all_per_token_logps
+
+def calculate_pad_tokens_in_prompt(
+    input_ids: torch.Tensor,
+    logits_to_keep: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a prompt tensor, returns the number of left-padding tokens in each sequence,
so [pad, pad, pad, cat] = 3 tokens.
+    """
+    if logits_to_keep >= input_ids.shape[1]:
+        raise ValueError("logits_to_keep must be smaller than the sequence length.")
+
+    prompt_section = input_ids[:, :-logits_to_keep]
+
+    padding_mask = (prompt_section == pad_token_id)
+
+    pad_token_counts = padding_mask.sum(dim=1)
+
+    return pad_token_counts
+
+def create_completion_attention_mask(
+    completion_input_ids: torch.Tensor,
+    left_pad_tokens_per_prompt: torch.Tensor,
+    max_left_pad: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a sequence [p, p, p, c, c, c, pad, pad, pad], where p are extra prompt tokens obtained
+    from slicing the torch tensor, c are completion tokens, and pad are pad tokens, this function
+    builds a completion mask that zeroes out the pad and p tokens: in this example,
+    [0, 0, 0, 1, 1, 1, 0, 0, 0].
+    """
+    batch_size, completion_len = completion_input_ids.shape
+    device = completion_input_ids.device
+
+    num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
+
+    indices = torch.arange(completion_len, device=device).unsqueeze(0)
+    shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
+
+    non_padding_mask = (completion_input_ids != pad_token_id)
+
+    final_mask = shift_mask & non_padding_mask
+
+    return final_mask
+
+def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
+    """
+    Moves all padding tokens in each sequence of a batch to the right.
+    """
+    mask = (tensor != pad_id)
+    # Must use stable=True since argsort on a binary mask is otherwise unordered
+    sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
+    packed_tensor = torch.gather(tensor, 1, sorted_indices)
+    return packed_tensor
+
+def align_logprobs_with_mask(
+    logprob_tensor: torch.Tensor,
+    attention_mask: torch.Tensor,
+    pad_value: float = 0.0
+) -> torch.Tensor:
+    """
+    Aligns a log probability tensor with a given attention mask.
+    """
+
+    device = logprob_tensor.device
+    batch_size, logprob_seq_len = logprob_tensor.shape
+    mask_seq_len = attention_mask.shape[1]
+
+    padded_logprobs = torch.full(
+        attention_mask.shape,
+        fill_value=pad_value,
+        dtype=logprob_tensor.dtype,
+        device=device
+    )
+
+    left_pad_counts = torch.argmax(attention_mask, dim=1)
+
+    cols = torch.arange(logprob_seq_len, device=device)
+    dest_indices = left_pad_counts.unsqueeze(1) + cols
+
+    # Create destination row indices
+    # Shape: [batch_size, logprob_seq_len]
+    row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
+
+    # Filter out-of-bounds indices and perform the assignment.
+    # Create a mask to identify only the indices that are within the bounds
+    # of the target tensor's sequence length.
+    valid_mask = dest_indices < mask_seq_len
+
+    # Use this mask to select only the valid row indices, column indices,
+    # and the corresponding values from the logprob tensor.
+    # This flattens the selected elements into 1D tensors.
+    valid_rows = row_indices[valid_mask]
+    valid_cols = dest_indices[valid_mask]
+    valid_vals = logprob_tensor[valid_mask]
+
+    # Place the valid values into their correct positions in the padded tensor
+    # using a single, efficient advanced indexing operation.
+    padded_logprobs[valid_rows, valid_cols] = valid_vals
+
+    return padded_logprobs
+
+@dataclass
+class UnslothIterativeSFTConfig(IterativeSFTConfig):
+    """
+
+Configuration class for the [`IterativeSFTTrainer`].
+
+
+
+The [`IterativeSFTTrainer`] is deprecated and will be removed in version 0.24.0. Please use the [`SFTTrainer`].
+ + + +This class includes only the parameters that are specific to Iterative SFT training. For a full list of training +arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this +class may differ from those in [`~transformers.TrainingArguments`]. + +Using [`~transformers.HfArgumentParser`] we can turn this class into +[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the +command line. + +Parameters: + > Parameters that control the model + + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model` + argument of the [`IterativeSFTTrainer`] is provided as a string. + + > Parameters that control the data preprocessing + + max_length (`int` or `None`, *optional*, defaults to `None`): + Maximum length of the tokenized sequence. Sequences longer than `max_length` are truncated. + truncation_mode (`str`, *optional*, defaults to `"keep_end"`): + The truncation mode to use, either `"keep_end"` or `"keep_start"`. + optimize_device_cache (`bool`, *optional*, defaults to `False`): + Whether to optimize accelerator cache for slightly more memory-efficient training. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + max_seq_length : Optional[int] = field( + default = None, + metadata = {'help': 'Maximum sequence length to truncate to.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better 
= None, + ignore_data_skip = False, + fsdp = None, + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + parallelism_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + project = 'huggingface', + trackio_space_id = 'trackio', + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = True, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = True, + model_init_kwargs = None, + max_length = None, + truncation_mode = 'keep_end', + optimize_device_cache = False, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + max_seq_length = None, + **kwargs, + ): + if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! 
Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = 
ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + model_init_kwargs = model_init_kwargs, + max_length = max_length, + truncation_mode = truncation_mode, + optimize_device_cache = optimize_device_cache,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks + self.max_seq_length = max_seq_length +pass + +class _UnslothIterativeSFTTrainer(Trainer): + """ + The IterativeSFTTrainer can be used to finetune models with methods that requires some steps between optimization. + + + + The [`IterativeSFTTrainer`] is deprecated and will be removed in version 0.24.0. Please use the [`SFTTrainer`]. + + + + Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in + `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. + args ([`IterativeSFTConfig`], *optional*, defaults to `None`): + Configuration for this trainer. If `None`, a default configuration is used. + data_collator (`DataCollator`, *optional*): + Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`. + Will default to [`~transformers.default_data_collator`] if no `processing_class` is provided, an instance + of [`~transformers.DataCollatorWithPadding`] otherwise if the processing_class is a feature extractor or + tokenizer. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. 
+ processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If `None`, the processing class is loaded from the model's name + with [`~transformers.AutoTokenizer.from_pretrained`]. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. + """ + + _tag_names = ["trl", "iterative-sft"] + + def __init__( + self, + model: Union[str, PreTrainedModel], + args: Optional[Union[IterativeSFTConfig, TrainingArguments]] = None, + data_collator: Optional[DataCollator] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = ( + None, + None, + ), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None, + ): + warnings.warn( + "The `IterativeSFTTrainer` is deprecated and will be removed in version 0.24.0. Please use the " + "`SFTTrainer`.", + FutureWarning, + ) + + # Args + model_id = model if isinstance(model, str) else model.config._name_or_path + if args is None: + model_name = model_id.split("/")[-1] + args = IterativeSFTConfig(f"{model_name}-IterativeSFT") + elif isinstance(args, TrainingArguments) and not isinstance(args, IterativeSFTConfig): + dict_args = args.to_dict() + dict_args["hub_token"] = args.hub_token # to_dict hides the hub_token + dict_args.pop("push_to_hub_token") + args = IterativeSFTConfig(**dict_args) + + # Handle the tokenizer + if processing_class is None: + processing_class = AutoTokenizer.from_pretrained(model_id) + + # Model + if args.model_init_kwargs is not None and not isinstance(model, str): + logger.warning( + "You passed model_init_kwargs to the `IterativeSFTConfig`, but your model is already instantiated. " + "The `model_init_kwargs` will be ignored." 
+ ) + if isinstance(model, str): + model = self._create_model_from_path(model, args) + + # PEFT configuration and model wrapping + if is_peft_available() and isinstance(model, PeftModel): + self.is_peft_model = True + else: + self.is_peft_model = False + + self.processing_class = processing_class + self.is_encoder_decoder = getattr(model.config, "is_encoder_decoder", False) + + if data_collator is None: + if self.is_encoder_decoder: + self.data_collator = DataCollatorForSeq2Seq( + processing_class, label_pad_token_id=-100, pad_to_multiple_of=8 + ) + else: + self.data_collator = DataCollatorForLanguageModeling(self.processing_class, mlm=False) + else: + self.data_collator = data_collator + + self.max_length = args.max_length + self.truncation_mode = args.truncation_mode + self.optimize_device_cache = args.optimize_device_cache + + super().__init__( + model=model, + args=args, + data_collator=self.data_collator, + eval_dataset=eval_dataset, + processing_class=processing_class, + compute_metrics=compute_metrics, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + self.create_optimizer_and_scheduler(self.args.max_steps) + + # prepare model, optimizer and lr_scheduler + self.model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, self.lr_scheduler + ) + + self.processing_class.truncation_side = "left" if self.truncation_mode == "keep_end" else "right" + + if not hasattr(self, "accelerator"): + raise AttributeError( + "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`." 
+ ) + + PPODecorators.optimize_device_cache = self.optimize_device_cache + + def _create_model_from_path(self, model_path: str, args: IterativeSFTConfig) -> PreTrainedModel: + """Creates a model from a path or model identifier.""" + model_init_kwargs = args.model_init_kwargs or {} + return AutoModelForCausalLM.from_pretrained(model_path, **model_init_kwargs) + + def prepare_model_inputs(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor): + if attention_mask is None: + attention_mask = [torch.ones_like(ids) for ids in input_ids] + + if self.is_encoder_decoder: + input_data = self.data_collator( + [ + {"input_ids": ids, "attention_mask": att, "labels": lab} + for ids, att, lab in zip(input_ids, attention_mask, labels) + ] + ).to(self.model.device) + + input_data.pop("decoder_input_ids", None) # This is directly computed inside the model + + input_data["labels"][input_data["labels"] == self.processing_class.pad_token_id] = -100 + + else: + input_data = self.data_collator( + [{"input_ids": ids, "attention_mask": att} for ids, att in zip(input_ids, attention_mask)] + ).to(self.model.device) + + # truncate in case the user has provided input_ids, attention_mask and labels + if self.max_length is not None: + if self.truncation_mode == "keep_start": + input_data = {k: v[: self.max_length] for k, v in input_data.items()} + elif self.truncation_mode == "keep_end": + input_data = {k: v[-self.max_length :] for k, v in input_data.items()} + else: + raise ValueError(f"Unknown truncation mode: {self.truncation_mode}") + + return input_data + + @staticmethod + def _step_safety_checker( + input_ids: list[torch.LongTensor], + attention_mask: list[torch.LongTensor], + labels: list[torch.LongTensor], + texts: list[str], + texts_labels: list[str], + ): + """ + Check if the input data is valid for training. + + Args: + input_ids (list[`torch.LongTensor`]): + List of tensors containing the input_ids + attention_mask (list[`torch.LongTensor`]): + List of tensors containing the attention_mask + labels (list[`torch.FloatTensor`]): + List of tensors containing the labels + texts (list[`str`]): + List of string containing the text input. + texts_labels (list[`str`]): + List of string containing the text labels. + + Returns: + `tuple`: The input data. 
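+
+        Example (illustrative only; the tensors and values below are arbitrary placeholders):
+
+            >>> ids = [torch.tensor([1, 2, 3]), torch.tensor([4, 5, 6])]
+            >>> mask = [torch.ones_like(t) for t in ids]
+            >>> _ = _UnslothIterativeSFTTrainer._step_safety_checker(ids, mask, ids, None, None)
+            >>> # Well-formed inputs are returned unchanged; malformed ones raise a ValueError.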
+ """ + if texts is None: + if attention_mask is None: + for name, tensor_list in zip(["input_ids", "labels"], [input_ids, labels]): + if not isinstance(tensor_list, list): + raise ValueError(f"{name} must be a list of tensors - got {type(tensor_list)}") + if not isinstance(tensor_list[0], torch.Tensor): + raise ValueError(f"Elements in {name} must be tensors - got {type(tensor_list[0])}") + else: + for name, tensor_list in zip( + ["input_ids", "attention_mask", "labels"], [input_ids, attention_mask, labels] + ): + if not isinstance(tensor_list, list): + raise ValueError(f"{name} must be a list of tensors - got {type(tensor_list)}") + if not isinstance(tensor_list[0], torch.Tensor): + raise ValueError(f"Elements in {name} must be tensors - got {type(tensor_list[0])}") + else: + if not isinstance(texts, list): + raise ValueError(f"'text' must be a list of strings - got {type(texts)}") + if not isinstance(texts[0], str): + raise ValueError(f"Elements in 'text' must be strings - got {type(texts[0])}") + if texts_labels is not None: + if not isinstance(texts_labels, list): + raise ValueError(f"'text_labels' must be a list of strings - got {type(texts_labels)}") + if not isinstance(texts_labels[0], str): + raise ValueError(f"Elements in 'text_labels' must be strings - got {type(texts_labels[0])}") + + return input_ids, attention_mask, labels, texts, texts_labels + + @PPODecorators.empty_device_cache() + def step( + self, + input_ids: Optional[list[torch.LongTensor]] = None, + attention_mask: Optional[list[torch.LongTensor]] = None, + labels: Optional[list[torch.LongTensor]] = None, + texts: Optional[list[str]] = None, + texts_labels: Optional[list[str]] = None, + ): + """ + Run an optimisation step given a list of input_ids, attention_mask, and labels or a list of text and + text_labels. + + Args: + input_ids (list[`torch.LongTensor`]): + List of tensors containing the input_ids (if not provided, text will be used) + attention_mask (list[`torch.LongTensor`], , *optional*): + List of tensors containing the attention_mask + labels (list[`torch.FloatTensor`], *optional*): + List of tensors containing the labels (if set to None, will default to input_ids) + texts (list[`str`], *optional*): + List of strings containing the text input (if not provided, input_ids will directly be used) + texts_labels (list[`str`], *optional*): + List of strings containing the text labels (if set to None, will default to text) + + Returns: + `dict[str, Any]`: A summary of the training statistics + """ + self.model.train() + + if self.state.global_step == 0: + self.tr_loss = torch.tensor(0.0).to(self.args.device) + self._globalstep_last_logged = self.state.global_step + + if input_ids is None and texts is None: + raise ValueError("Step should include `input_ids` or `texts` as keyword arguments.") + elif input_ids is not None and texts is not None: + logger.warning( + "Both `input_ids` and `texts` argument are provided. `input_ids` will be ignored. " + "Please provide only one of the two.", + ) + + if labels is None and texts_labels is None and self.is_encoder_decoder: + raise ValueError( + "No 'labels' or 'text_labels' are provided. When using an encoder-decoder architecture, 'labels' or 'text_labels' must be passed." 
+ ) + + # Convert Column to list if not already + input_ids = input_ids[:] if input_ids is not None else None + attention_mask = attention_mask[:] if attention_mask is not None else None + labels = labels[:] if labels is not None else None + texts = texts[:] if texts is not None else None + texts_labels = texts_labels[:] if texts_labels is not None else None + + input_ids, attention_mask, labels, texts, texts_labels = self._step_safety_checker( + input_ids, attention_mask, labels, texts, texts_labels + ) + + if texts is not None: + model_inputs = self.processing_class( + texts, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt" + ) + + input_ids, attention_mask = model_inputs["input_ids"], model_inputs["attention_mask"] + + if texts_labels is not None: + labels = self.processing_class( + texts, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt" + )["input_ids"] + + if labels is None: + labels = input_ids + + model_inputs = self.prepare_model_inputs(input_ids, attention_mask, labels) + + model_inputs_names = list(model_inputs.keys()) + + batch_dict = {} + batch_dict.update(model_inputs) + + def collator(data): + return_dict = dict() + for key in data[0]: + if key in ["input_ids", "attention_mask", "labels"]: + return_dict[key] = torch.stack([d[key] for d in data]).to(self.model.device) + return return_dict + + batch_data = Dataset.from_dict(batch_dict) + batch_data.set_format("torch") + + step_dataloader = DataLoader( + batch_data, + batch_size=self.args.per_device_train_batch_size, + shuffle=True, + collate_fn=collator, + ) + + for _, batch in enumerate(step_dataloader): + with self.accelerator.accumulate(self.model): + model_inputs = {k: batch[k] for k in model_inputs_names} + loss = self.compute_loss(self.model, model_inputs) + + if self.args.n_gpu > 1: + loss = loss.mean() + + tr_loss_step = loss.detach() + + self.accelerator.backward(loss) + + if self.accelerator.sync_gradients and self.args.max_grad_norm is not None: + self.accelerator.clip_grad_norm_( + self.model.parameters(), + self.args.max_grad_norm, + ) + + self.optimizer.step() + self.optimizer.zero_grad() + if self.lr_scheduler is not None: + self.lr_scheduler.step() + + self.state.global_step += 1 + + # update stats etc + self.tr_loss += tr_loss_step + + self._maybe_log_save_evaluate() + + def _maybe_log_save_evaluate(self): + # check if eval is required + if self.args.eval_steps is not None: + if self.state.global_step % self.args.eval_steps == 0 and self.state.global_step != 0: + self.evaluate(self.eval_dataset) + + # check if logging is required + if self.args.logging_steps is not None: + if self.state.global_step % self.args.logging_steps == 0 and self.state.global_step != 0: + logs: dict[str, float] = {} + + tr_loss_scalar = self._nested_gather(self.tr_loss).mean().item() + + # reset tr_loss to zero + self.tr_loss -= self.tr_loss + + logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) + logs["learning_rate"] = self._get_learning_rate() + + self._globalstep_last_logged = self.state.global_step + + self.log(logs) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + 
dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="Iterative SFT", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothIterativeSFTTrainer(_UnslothIterativeSFTTrainer): + """ + +The IterativeSFTTrainer can be used to finetune models with methods that requires some steps between optimization. + + + +The [`IterativeSFTTrainer`] is deprecated and will be removed in version 0.24.0. Please use the [`SFTTrainer`]. + + + +Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in + `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. + args ([`IterativeSFTConfig`], *optional*, defaults to `None`): + Configuration for this trainer. If `None`, a default configuration is used. + data_collator (`DataCollator`, *optional*): + Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`. + Will default to [`~transformers.default_data_collator`] if no `processing_class` is provided, an instance + of [`~transformers.DataCollatorWithPadding`] otherwise if the processing_class is a feature extractor or + tokenizer. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If `None`, the processing class is loaded from the model's name + with [`~transformers.AutoTokenizer.from_pretrained`]. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. 
+ preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. + + """ + def __init__( + self, + model, + args = None, + data_collator = None, + eval_dataset = None, + processing_class = None, + preprocess_logits_for_metrics = None, + compute_metrics = None, + **kwargs + ): + if args is None: args = UnslothIterativeSFTConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('iterative_sft_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems 
to not work? [TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + model = model, + args = args, + data_collator = data_collator, + eval_dataset = eval_dataset, + processing_class = processing_class, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + compute_metrics = compute_metrics,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass + + +if hasattr(logger, "addFilter"): + import logging + class HideLoggingMessage(logging.Filter): + def __init__(self, text): self.text = text + def filter(self, x): return not (self.text in x.getMessage()) + pass + logger.addFilter(HideLoggingMessage("`use_cache=True`")) + diff --git a/unsloth_compiled_cache/UnslothKTOTrainer.py b/unsloth_compiled_cache/UnslothKTOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..6226aa246255dcc18933e3bc5c236ca4fefe513f --- /dev/null +++ b/unsloth_compiled_cache/UnslothKTOTrainer.py @@ -0,0 +1,2289 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . 
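+# Note on `chunked_selective_log_softmax` below: for every position t it computes
+#   logp[t] = logits[t, index[t]] - logsumexp(logits[t, :])
+# which equals log_softmax(logits) gathered at `index`, evaluated in four float32
+# chunks to bound peak memory. A minimal unchunked reference, for intuition only
+# (this hypothetical helper is not part of the generated code path):
+#
+#   def selective_log_softmax_reference(logits, index):
+#       logits = logits.float()
+#       return torch.gather(
+#           logits.log_softmax(dim=-1), dim=-1, index=index.unsqueeze(-1)
+#       ).squeeze(-1)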
+ +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.kto_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, KTOConfig, KTOTrainer, Literal, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SequentialSampler, Trainer, TrainerCallback, TrainingArguments, Union, _get_kl_dataset, _process_tokens, _tokenize, autocast, concatenate_datasets, contextmanager, create_reference_model, defaultdict, disable_dropout_in_model, generate_model_card, get_comet_experiment_url, has_length, inspect, is_comet_available, is_liger_kernel_available, is_peft_available, is_wandb_available, itemgetter, log_table_to_comet_experiment, logger, logging, maybe_apply_chat_template, maybe_extract_prompt, maybe_unpair_preference_dataset, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_deepspeed, prepare_model_for_kbit_training, random, selective_log_softmax, textwrap, torch, tqdm, F, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, logger, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) + logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values + all_per_token_logps.append(per_token_logps) + pass + all_per_token_logps = torch.concat(all_per_token_logps) + all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) + return all_per_token_logps + +def calculate_pad_tokens_in_prompt( + input_ids: torch.Tensor, + logits_to_keep: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given 
a prompt tensor, return the number of left-padding tokens in each sequence, e.g. [pad, pad, pad, cat] -> 3.
+    """
+    if logits_to_keep >= input_ids.shape[1]:
+        raise ValueError("logits_to_keep must be smaller than the sequence length.")
+
+    prompt_section = input_ids[:, :-logits_to_keep]
+
+    padding_mask = (prompt_section == pad_token_id)
+
+    pad_token_counts = padding_mask.sum(dim=1)
+
+    return pad_token_counts
+
+def create_completion_attention_mask(
+    completion_input_ids: torch.Tensor,
+    left_pad_tokens_per_prompt: torch.Tensor,
+    max_left_pad: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a sequence laid out as [p, p, p, c, c, c, pad, pad, pad], where `p` are extra prompt
+    tokens left over from slicing the tensor, `c` are completion tokens, and `pad` are padding
+    tokens, build a completion mask that zeroes out the `p` and `pad` positions, giving
+    [0, 0, 0, 1, 1, 1, 0, 0, 0] in this example.
+    """
+    batch_size, completion_len = completion_input_ids.shape
+    device = completion_input_ids.device
+
+    num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
+
+    indices = torch.arange(completion_len, device=device).unsqueeze(0)
+    shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
+
+    non_padding_mask = (completion_input_ids != pad_token_id)
+
+    final_mask = shift_mask & non_padding_mask
+
+    return final_mask
+
+def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
+    """
+    Moves all padding tokens in each sequence of a batch to the right.
+    """
+    mask = (tensor != pad_id)
+    # Must sort with stable=True since a binary mask has no unique ordering
+    sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
+    packed_tensor = torch.gather(tensor, 1, sorted_indices)
+    return packed_tensor
+
+def align_logprobs_with_mask(
+    logprob_tensor: torch.Tensor,
+    attention_mask: torch.Tensor,
+    pad_value: float = 0.0
+) -> torch.Tensor:
+    """
+    Aligns a log probability tensor with a given attention mask.
+    """
+
+    device = logprob_tensor.device
+    batch_size, logprob_seq_len = logprob_tensor.shape
+    mask_seq_len = attention_mask.shape[1]
+
+    padded_logprobs = torch.full(
+        attention_mask.shape,
+        fill_value=pad_value,
+        dtype=logprob_tensor.dtype,
+        device=device
+    )
+
+    left_pad_counts = torch.argmax(attention_mask, dim=1)
+
+    cols = torch.arange(logprob_seq_len, device=device)
+    dest_indices = left_pad_counts.unsqueeze(1) + cols
+
+    # Create destination row indices
+    # Shape: [batch_size, logprob_seq_len]
+    row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
+
+    # Filter out-of-bounds indices and perform the assignment: keep only the
+    # destination indices that fall within the target tensor's sequence length.
+    valid_mask = dest_indices < mask_seq_len
+
+    # Select the valid row indices, column indices, and the corresponding values
+    # from the logprob tensor; this flattens the selected elements into 1D tensors.
+    valid_rows = row_indices[valid_mask]
+    valid_cols = dest_indices[valid_mask]
+    valid_vals = logprob_tensor[valid_mask]
+
+    # Place the valid values into their correct positions in the padded tensor
+    # using a single advanced-indexing assignment.
+    padded_logprobs[valid_rows, valid_cols] = valid_vals
+
+    return padded_logprobs
+
+@dataclass
+class UnslothKTOConfig(KTOConfig):
+    """
+
+Configuration class for the [`KTOTrainer`].
+
+This class includes only the parameters that are specific to KTO training. For a full list of training arguments,
+please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
+differ from those in [`~transformers.TrainingArguments`].
+
+Using [`~transformers.HfArgumentParser`] we can turn this class into
+[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+command line.
+
+Parameters:
+    max_length (`int` or `None`, *optional*, defaults to `1024`):
+        Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
+        to use the default data collator.
+    max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
+        Maximum length of the prompt. This argument is required if you want to use the default data collator.
+    max_completion_length (`int` or `None`, *optional*, defaults to `None`):
+        Maximum length of the completion. This argument is required if you want to use the default data collator
+        and your model is an encoder-decoder.
+    beta (`float`, *optional*, defaults to `0.1`):
+        Parameter controlling the deviation from the reference model. Higher β means less deviation from the
+        reference model.
+    loss_type (`str`, *optional*, defaults to `"kto"`):
+        Type of loss to use. Possible values are:
+
+        - `"kto"`: KTO loss from the [KTO](https://huggingface.co/papers/2402.01306) paper.
+        - `"apo_zero_unpaired"`: Unpaired variant of APO-zero loss from the
+          [APO](https://huggingface.co/papers/2408.06266) paper.
+
+    desirable_weight (`float`, *optional*, defaults to `1.0`):
+        Desirable losses are weighted by this factor to counter an unequal number of desirable and undesirable
+        pairs.
+    undesirable_weight (`float`, *optional*, defaults to `1.0`):
+        Undesirable losses are weighted by this factor to counter an unequal number of desirable and undesirable
+        pairs.
+    label_pad_token_id (`int`, *optional*, defaults to `-100`):
+        Label pad token id. This argument is required if you want to use the default data collator.
+    padding_value (`int` or `None`, *optional*, defaults to `None`):
+        Padding value to use. If `None`, the padding value of the tokenizer is used.
+    truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
+        Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
+        This argument is required if you want to use the default data collator.
+    generate_during_eval (`bool`, *optional*, defaults to `False`):
+        If `True`, generates and logs completions from both the model and the reference model to W&B or Comet
+        during evaluation.
+    is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`):
+        When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
+        you need to specify if the model returned by the callable is an encoder-decoder model.
+    precompute_ref_log_probs (`bool`, *optional*, defaults to `False`):
+        Whether to precompute reference model log probabilities for training and evaluation datasets. This is
+        useful when training without the reference model to reduce the total GPU memory needed.
+    model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
+        Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
+        string.
+    ref_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
+        Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model
+        from a string.
+ dataset_num_proc: (`int` or `None`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model and reference model. + use_liger_loss (`bool`, *optional*, defaults to `False`): + Whether to use Liger loss. It requires liger-kernel to be installed. + base_model_attribute_name (`str`, *optional*, defaults to `"model"`): + Name of the attribute in the model that contains the base model. This is used to get the base model from + the model when the model does not have a `get_decoder` method in the case when `use_liger_loss` is `True`. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + max_seq_length : Optional[int] = field( + default = None, + metadata = {'help': 'Maximum sequence length to truncate to.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = None, + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + parallelism_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + project = 'huggingface', + trackio_space_id = 'trackio', + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + 
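+        # Note: several defaults in this signature intentionally differ from stock
+        # `transformers.TrainingArguments` (e.g. `seed = 3407`, `optim = 'adamw_8bit'`,
+        # `logging_steps = 1`), as flagged in the class docstring above.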
skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = True, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = True, + max_length = 1024, + max_prompt_length = 512, + max_completion_length = None, + beta = 0.1, + loss_type = 'kto', + desirable_weight = 1.0, + undesirable_weight = 1.0, + label_pad_token_id = -100, + padding_value = None, + truncation_mode = 'keep_end', + generate_during_eval = False, + is_encoder_decoder = None, + disable_dropout = True, + precompute_ref_log_probs = False, + model_init_kwargs = None, + ref_model_init_kwargs = None, + dataset_num_proc = None, + use_liger_loss = False, + base_model_attribute_name = 'model', + vllm_sampling_params = None, + unsloth_num_chunks = -1, + max_seq_length = None, + **kwargs, + ): + if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! 
Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = min(max(cpu_count()+4, 2), 64) + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + 
ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + max_length = max_length, + max_prompt_length = max_prompt_length, + max_completion_length = max_completion_length, + beta = beta, + loss_type = loss_type, + desirable_weight = desirable_weight, + undesirable_weight = undesirable_weight, + label_pad_token_id = label_pad_token_id, + padding_value = padding_value, + truncation_mode = truncation_mode, + generate_during_eval = generate_during_eval, + is_encoder_decoder = is_encoder_decoder, + disable_dropout = disable_dropout, + precompute_ref_log_probs = precompute_ref_log_probs, + model_init_kwargs = model_init_kwargs, + ref_model_init_kwargs = ref_model_init_kwargs, + dataset_num_proc = dataset_num_proc, + use_liger_loss = use_liger_loss, + base_model_attribute_name = base_model_attribute_name,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks + self.max_seq_length = max_seq_length +pass + +class _UnslothKTOTrainer(Trainer): + r""" + Initialize KTOTrainer. + + Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForSequenceClassification`. + ref_model (`PreTrainedModelWrapper`): + Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation + and loss. If no reference model is provided, the trainer will create a reference model with the same + architecture as the model to be optimized. + args (`KTOConfig`): + The arguments to use for training. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. 
+ processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + data_collator (`transformers.DataCollator`, *optional*, defaults to `None`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + peft_config (`dict`, defaults to `None`): + The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in + a PEFT model. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take an `EvalPrediction` and return a dictionary mapping strings to + metric values. + model_adapter_name (`str`, defaults to `None`): + Name of the train target PEFT adapter, when using LoRA with multiple adapters. + ref_adapter_name (`str`, defaults to `None`): + Name of the reference PEFT adapter, when using LoRA with multiple adapters. + """ + + _tag_names = ["trl", "kto"] + + def __init__( + self, + model: Union[PreTrainedModel, nn.Module, str] = None, + ref_model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, + args: KTOConfig = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + data_collator: Optional[DataCollator] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional[dict] = None, + compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None, + model_adapter_name: Optional[str] = None, + ref_adapter_name: Optional[str] = None, + ): + if type(args) is TrainingArguments: + raise ValueError("Please use `KTOConfig` instead of `TrainingArguments`.") + + if not isinstance(model, str) and ref_model is model: + raise ValueError( + "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the " + "same as `model`, you must pass a copy of it, or `None` if you use peft." + ) + + if args.model_init_kwargs is None: + model_init_kwargs = {} + elif not isinstance(model, str): + raise ValueError("You passed `model_init_kwargs` to the KTOTrainer. 
But your model is already instantiated.") + else: + model_init_kwargs = args.model_init_kwargs + dtype = model_init_kwargs.get("dtype") + if dtype is not None: + # Convert to `torch.dtype` if a str is passed + if isinstance(dtype, str) and dtype != "auto": + dtype = getattr(torch, dtype) + if dtype != "auto" and not isinstance(dtype, torch.dtype): + raise ValueError( + f"Invalid `dtype` passed to the KTOConfig. Expected either a string representing a `torch.dtype` or 'auto', but got {dtype}." + ) + model_init_kwargs["dtype"] = dtype + + if args.ref_model_init_kwargs is None: + ref_model_init_kwargs = {} + elif not isinstance(ref_model, str): + raise ValueError( + "You passed `ref_model_init_kwargs` to the KTOTrainer. But your ref_model is already instantiated." + ) + else: + ref_model_init_kwargs = args.ref_model_init_kwargs + dtype = ref_model_init_kwargs.get("dtype") + if dtype is not None: + # Convert to `torch.dtype` if a str is passed + if isinstance(dtype, str) and dtype != "auto": + dtype = getattr(torch, dtype) + if dtype != "auto" and not isinstance(dtype, torch.dtype): + raise ValueError( + f"Invalid `dtype` passed to the KTOConfig. Expected either a string representing a `torch.dtype` or 'auto', but got {dtype}." + ) + ref_model_init_kwargs["dtype"] = dtype + + if isinstance(model, str): + model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) + + if isinstance(ref_model, str): + ref_model = AutoModelForCausalLM.from_pretrained(ref_model, **ref_model_init_kwargs) + + # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16` + # has been called in order to properly call autocast if needed. + self._peft_has_been_casted_to_bf16 = False + + if not is_peft_available() and peft_config is not None: + raise ValueError( + "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it with `pip install peft` to use the PEFT models" + ) + elif is_peft_available() and peft_config is not None: + # if model is a peft model and we have a peft_config, we merge and unload it first + if isinstance(model, PeftModel): + model = model.merge_and_unload() + + if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False): + _support_gc_kwargs = hasattr( + args, "gradient_checkpointing_kwargs" + ) and "gradient_checkpointing_kwargs" in list( + inspect.signature(prepare_model_for_kbit_training).parameters + ) + + prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing} + + if _support_gc_kwargs: + prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs + + model = prepare_model_for_kbit_training(model, **prepare_model_kwargs) + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + # get peft model with the given config (Unsloth applies the PEFT config upstream, so the model is kept as-is here) + model = model + if args.bf16 and getattr(model, "is_loaded_in_4bit", False): + peft_module_casting_to_bf16(model) + # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager + self._peft_has_been_casted_to_bf16 = True + + # For models that use gradient_checkpointing, we need to attach a hook that enables input + # to explicitly have `requires_grad=True`, otherwise training will either 
silently + fail or completely fail. + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + if args.generate_during_eval and not (is_wandb_available() or is_comet_available()): + raise ValueError( + "`generate_during_eval=True` requires Weights and Biases or Comet to be installed." + " Please install `wandb` or `comet-ml` to resolve." + ) + + if model is not None: + self.is_encoder_decoder = model.config.is_encoder_decoder + elif args.is_encoder_decoder is None: + raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.") + else: + self.is_encoder_decoder = args.is_encoder_decoder + + self.is_peft_model = is_peft_available() and isinstance(model, PeftModel) + self.model_adapter_name = model_adapter_name + self.ref_adapter_name = ref_adapter_name + + if ref_model: + self.ref_model = ref_model + elif self.is_peft_model or args.precompute_ref_log_probs: + # The `model` with adapters turned off will be used as the reference model + self.ref_model = None + else: + self.ref_model = create_reference_model(model) + + if processing_class is None: + raise ValueError( + "A processing_class must be specified when using the default DPODataCollatorWithPadding" + ) + if args.max_length is None: + logger.warning( + "When using DPODataCollatorWithPadding, you should set `max_length` in the KTOTrainer's init;" + " it will be set to `512` by default, but you should do it yourself in the future.", + ) + max_length = 512 + if args.max_length is not None: + max_length = args.max_length + + if args.max_prompt_length is None: + logger.warning( + "When using DPODataCollatorWithPadding, you should set `max_prompt_length` in the KTOTrainer's init;" + " it will be set to `128` by default, but you should do it yourself in the future.", + ) + max_prompt_length = 128 + if args.max_prompt_length is not None: + max_prompt_length = args.max_prompt_length + + max_completion_length = None + if args.max_completion_length is None and self.is_encoder_decoder: + logger.warning( + "When using DPODataCollatorWithPadding with an encoder-decoder architecture, you should set `max_completion_length` in the KTOTrainer's init;" + " it will be set to `128` by default, but you should do it yourself in the future.", + ) + max_completion_length = 128 + if args.max_completion_length is not None and self.is_encoder_decoder: + max_completion_length = args.max_completion_length + + if data_collator is None: + data_collator = DPODataCollatorWithPadding( + pad_token_id=processing_class.pad_token_id, + label_pad_token_id=args.label_pad_token_id, + is_encoder_decoder=self.is_encoder_decoder, + ) + + if args.remove_unused_columns: + args.remove_unused_columns = False + # warn users + logger.warning( + "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your KTOConfig;" + " we have set it for you, but you should do it yourself in the future.", + ) + + self.use_dpo_data_collator = True + else: + self.use_dpo_data_collator = False + + # Disable dropout in the model and reference model + if args.disable_dropout: + disable_dropout_in_model(model) + if self.ref_model is not None: + disable_dropout_in_model(self.ref_model) + + self.loss_type = args.loss_type + 
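# Note (illustrative, not executed): with the default `loss_type="kto"`, `kto_loss` below roughly implements
+        # Eq. (7) of the KTO paper (https://huggingface.co/papers/2402.01306):
+        #     chosen:   1 - sigmoid(beta * ((policy_logps - ref_logps) - KL))
+        #     rejected: 1 - sigmoid(beta * (KL - (policy_logps - ref_logps)))
+        # `apo_zero_unpaired` drops the KL term, which is why `calculate_KL` is switched off for it just below. +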
self.max_length = max_length + self.generate_during_eval = args.generate_during_eval + self.label_pad_token_id = args.label_pad_token_id + self.padding_value = args.padding_value if args.padding_value is not None else processing_class.pad_token_id + self.max_prompt_length = max_prompt_length + self.truncation_mode = args.truncation_mode + self.max_completion_length = max_completion_length + self.processing_class = processing_class + self.precompute_ref_log_probs = args.precompute_ref_log_probs + + # Not all losses require a KL calculation + self.calculate_KL = True + if self.loss_type in ["apo_zero_unpaired"]: + self.calculate_KL = False + + # Since ref log probs are precomputed on the first call to get_train/eval_dataloader, + # keep track of the first call to avoid recomputing them on subsequent calls + self._precomputed_train_ref_log_probs = False + self._precomputed_eval_ref_log_probs = False + + # metrics + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + + # KTO parameters + self.beta = args.beta + self.desirable_weight = args.desirable_weight + self.undesirable_weight = args.undesirable_weight + self.aux_loss_enabled = getattr(model.config, "output_router_logits", False) + self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0) + if self.aux_loss_enabled and self.aux_loss_coef == 0.0: + logger.warning( + "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to " + "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value " + "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary " + "loss.", + ) + + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in KTO, the sampled data does not include the + # "input_ids" key. Instead, the available keys are "prompt_input_ids" and "completion_input_ids". As a result, + # the trainer issues the warning: "Could not estimate the number of tokens of the input, floating-point + # operations will not be computed." To suppress this warning, we set the "estimate_tokens" key in the model's + # "warnings_issued" dictionary to True. This acts as a flag to indicate that the warning has already been + # issued. + model.warnings_issued["estimate_tokens"] = True + + # Compute that only on the main process for faster data processing. 
+ # see: https://github.com/huggingface/trl/pull/1255 + with PartialState().main_process_first(): + # Extract the prompt if needed + train_dataset = train_dataset.map( + maybe_extract_prompt, num_proc=args.dataset_num_proc, desc="Extracting prompt from train dataset" + ) + # Unpair the dataset if needed + train_dataset = maybe_unpair_preference_dataset( + train_dataset, args.dataset_num_proc, desc="Unpairing train dataset" + ) + # Apply the chat template if needed + train_dataset = train_dataset.map( + maybe_apply_chat_template, + fn_kwargs={"tokenizer": processing_class}, + num_proc=args.dataset_num_proc, + desc="Applying chat template to train dataset", + ) + if eval_dataset is not None: + eval_dataset = eval_dataset.map( + maybe_extract_prompt, num_proc=args.dataset_num_proc, desc="Extracting prompt from eval dataset" + ) + eval_dataset = maybe_unpair_preference_dataset( + eval_dataset, args.dataset_num_proc, desc="Unpairing eval dataset" + ) + eval_dataset = eval_dataset.map( + maybe_apply_chat_template, + fn_kwargs={"tokenizer": processing_class}, + num_proc=args.dataset_num_proc, + desc="Applying chat template to eval dataset", + ) + + # Tokenize and prepare the training datasets + train_dataset = train_dataset.map( + _tokenize, + batched=True, + fn_kwargs={"tokenizer": self.processing_class}, + num_proc=args.dataset_num_proc, + desc="Tokenizing train dataset", + ) + + fn_kwargs = { + "prefix": "", + "is_encoder_decoder": self.is_encoder_decoder, + "tokenizer": self.processing_class, + "max_length": self.max_length, + "truncation_mode": self.truncation_mode, + "label_pad_token_id": self.label_pad_token_id, + "max_prompt_length": self.max_prompt_length, + "max_completion_length": self.max_completion_length, + } + + train_dataset = train_dataset.map( + _process_tokens, + fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + desc="Processing tokenized train dataset", + ) + + # Tokenize and prepare the eval datasets + if eval_dataset is not None: + eval_dataset = eval_dataset.map( + _tokenize, + fn_kwargs={"tokenizer": self.processing_class}, + batched=True, + num_proc=args.dataset_num_proc, + desc="Tokenizing eval dataset", + ) + + eval_dataset = eval_dataset.map( + _process_tokens, + fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + desc="Processing tokenized eval dataset", + ) + + # Get KL datasets if needed + if self.calculate_KL: + if args.per_device_train_batch_size <= 1: + raise ValueError( + "Actual (not effective) batch size must be > 1. KTO will not work properly because the KL term will be equivalent to the implied reward." 
+ ) + + # create pairs for estimating the KL term by flipping the matched pairs in each batch of size total_batch_size + # i.e., [x_1, y_1], ..., [x_n, y_n] --> [x_1, y_n], ..., [x_n, y_1] = [x'_1, y'_1], ..., [x'_n, y'_n] + train_kl_dataset = train_dataset.map( + _get_kl_dataset, + batched=True, + batch_size=args.per_device_train_batch_size, + num_proc=args.dataset_num_proc, + desc="Extracting KL train dataset", + ) + + fn_kwargs["prefix"] = "KL_" + train_kl_dataset = train_kl_dataset.map( + _process_tokens, + fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + remove_columns=[c for c in train_kl_dataset.column_names if c in train_dataset.column_names], + desc="Processing tokenized train KL dataset", + ) + + # merge the datasets + train_dataset = concatenate_datasets([train_dataset, train_kl_dataset], axis=1) + + if eval_dataset is not None: + # Get KL dataset + eval_kl_dataset = eval_dataset.map( + _get_kl_dataset, + batched=True, + batch_size=args.per_device_train_batch_size, + num_proc=args.dataset_num_proc, + desc="Extracting eval KL dataset", + ) + + eval_kl_dataset = eval_kl_dataset.map( + _process_tokens, + fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + remove_columns=[c for c in eval_kl_dataset.column_names if c in eval_dataset.column_names], + desc="Processing tokenized eval KL dataset", + ) + + # merge the datasets + eval_dataset = concatenate_datasets([eval_dataset, eval_kl_dataset], axis=1) + + # calculate dataset desirability balance + num_desirable = max(sum(train_dataset["label"]), 1) + num_undesirable = max(len(train_dataset["label"]) - num_desirable, 1) # "label" is binary + + if num_desirable != num_undesirable: + # The lower and upper bounds come from Eq. [8] of https://huggingface.co/papers/2402.01306 + des_weight_lower_bound = round((num_undesirable * self.undesirable_weight / num_desirable) * 1, 2) + des_weight_upper_bound = round((num_undesirable * self.undesirable_weight / num_desirable) * 1.33, 2) + und_weight_lower_bound = round((num_desirable * self.desirable_weight / num_undesirable) / 1.33, 2) + und_weight_upper_bound = round((num_desirable * self.desirable_weight / num_undesirable) / 1, 2) + + des_weight_in_range = des_weight_lower_bound <= self.desirable_weight <= des_weight_upper_bound + und_weight_in_range = und_weight_lower_bound <= self.undesirable_weight <= und_weight_upper_bound + + if not (des_weight_in_range or und_weight_in_range): + logger.warning( + "You have different amounts of desirable/positive and undesirable/negative examples but the " + "weights on the desirable and undesirable losses don't seem to be in an ideal range. Based " + f"on your data, we recommend EITHER " + f"desirable_weight in [{des_weight_lower_bound}, {des_weight_upper_bound}] or " + f"undesirable_weight in [{und_weight_lower_bound}, {und_weight_upper_bound}] (but NOT BOTH). " + "See the documentation on how to optimally set these weights.", + ) + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. 
We set + self.model_accepts_loss_kwargs to False to enable scaling. + self.model_accepts_loss_kwargs = False + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + if not hasattr(self, "accelerator"): + raise AttributeError( + "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`." + ) + + # Deepspeed Zero-3 does not support precompute_ref_log_probs + if self.is_deepspeed_enabled: + if self.accelerator.state.deepspeed_plugin.zero_stage == 3 and self.precompute_ref_log_probs: + raise ValueError( + "You cannot use `precompute_ref_log_probs=True` with Deepspeed ZeRO-3. Please set `precompute_ref_log_probs=False`." + ) + + if self.ref_model is None: + if not (self.is_peft_model or self.precompute_ref_log_probs): + raise ValueError( + "No reference model and model is not a Peft model. Try setting `precompute_ref_log_probs=True`" + ) + else: + if self.is_deepspeed_enabled: + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + + # Import Liger loss if enabled + if self.args.use_liger_loss: + if not is_liger_kernel_available(): + raise ImportError( + "You set `use_liger_loss=True` but the liger kernel is not available. " + "Please install liger-kernel first: `pip install liger-kernel`" + ) + if self.loss_type in ["apo_zero_unpaired"]: + raise ValueError( + "You cannot set `loss_type='apo_zero_unpaired'` with liger-kernel. " + "Only KTO loss is supported with liger-kernel." + ) + if self.precompute_ref_log_probs: + raise ValueError( + "You cannot use `precompute_ref_log_probs=True` with liger kernel. Please set " + "`precompute_ref_log_probs=False`." + ) + if self.is_peft_model or self.ref_adapter_name is not None: + raise ValueError( + "You cannot use `use_liger_loss=True` with Peft models. Please set `use_liger_loss=False`." + ) + self.kto_loss_fn = LigerFusedLinearKTOLoss( + ignore_index=self.label_pad_token_id, beta=self.beta, use_ref_model=(self.ref_model is not None) + ) + + @contextmanager + def null_ref_context(self): + """Context manager for handling null reference model (that is, peft adapter manipulation).""" + with ( + self.accelerator.unwrap_model(self.model).disable_adapter() + if self.is_peft_model and not self.ref_adapter_name + else nullcontext() + ): + if self.ref_adapter_name: + self.model.set_adapter(self.ref_adapter_name) + yield + if self.ref_adapter_name: + self.model.set_adapter(self.model_adapter_name or "default") + + def get_train_dataloader(self) -> DataLoader: + """ + Returns the training [`~torch.utils.data.DataLoader`]. + + Overrides `transformers.Trainer.get_train_dataloader` to precompute `ref_log_probs`. 
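+
+        The precomputed reference log probs are gathered across processes and attached to `self.train_dataset`
+        as new columns (`reference_logps` and, when the loss requires a KL term, `reference_KL_logps`).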
+ """ + + if self.precompute_ref_log_probs and not self._precomputed_train_ref_log_probs: + dataloader_params = { + "batch_size": self.args.per_device_train_batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(self.train_dataset, **dataloader_params)) + reference_completion_logps = [] + reference_KL_logps = [] + + for padded_batch in tqdm(iterable=data_loader, desc="Train dataset reference log probs"): + reference_completion_logp, reference_KL_logp = self.compute_reference_log_probs(padded_batch) + + reference_completion_logp = self.accelerator.gather_for_metrics(reference_completion_logp) + reference_completion_logps.append(reference_completion_logp.cpu()) + + if self.calculate_KL: + reference_KL_logp = self.accelerator.gather_for_metrics(reference_KL_logp) + reference_KL_logps.append(reference_KL_logp.cpu()) + + self.train_dataset = self.train_dataset.add_column( + name="reference_logps", column=torch.cat(reference_completion_logps).float().numpy() + ) + + if self.calculate_KL: + self.train_dataset = self.train_dataset.add_column( + name="reference_KL_logps", column=torch.cat(reference_KL_logps).float().numpy() + ) + + self._precomputed_train_ref_log_probs = True + + return super().get_train_dataloader() + + def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: + """ + Returns the evaluation [`~torch.utils.data.DataLoader`]. + + Subclass of transformers.src.transformers.trainer.get_eval_dataloader to precompute `ref_log_probs`. + + Args: + eval_dataset (`torch.utils.data.Dataset`, *optional*): + If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted + by the `model.forward()` method are automatically removed. It must implement `__len__`. 
+ """ + if eval_dataset is None and self.eval_dataset is None: + raise ValueError("Trainer: evaluation requires an eval_dataset.") + eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + + if self.precompute_ref_log_probs and not self._precomputed_eval_ref_log_probs: + dataloader_params = { + "batch_size": self.args.per_device_eval_batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(eval_dataset, **dataloader_params)) + + reference_completion_logps = [] + reference_KL_logps = [] + + for padded_batch in tqdm(iterable=data_loader, desc="Eval dataset reference log probs"): + reference_completion_logp, reference_KL_logp = self.compute_reference_log_probs(padded_batch) + + reference_completion_logp = self.accelerator.gather_for_metrics(reference_completion_logp) + reference_completion_logps.append(reference_completion_logp.cpu()) + + if self.calculate_KL: + reference_KL_logp = self.accelerator.gather_for_metrics(reference_KL_logp) + reference_KL_logps.append(reference_KL_logp.cpu()) + + eval_dataset = eval_dataset.add_column( + name="reference_logps", column=torch.cat(reference_completion_logps).float().numpy() + ) + if self.calculate_KL: + eval_dataset = eval_dataset.add_column( + name="reference_KL_logps", column=torch.cat(reference_KL_logps).float().numpy() + ) + + # Save calculated reference_chosen_logps and reference_rejected_logps to the eval_dataset for subsequent runs + if self.eval_dataset is not None: + self.eval_dataset = eval_dataset + self._precomputed_eval_ref_log_probs = True + + return super().get_eval_dataloader(eval_dataset=eval_dataset) + + def compute_reference_log_probs(self, padded_batch: dict) -> dict: + """Computes log probabilities of the reference model for a single padded batch of a KTO specific dataset.""" + with torch.no_grad(): + if self.ref_model is None: + with self.null_ref_context(): + if self.is_encoder_decoder: + completion_logits = self.model( + padded_batch["prompt_input_ids"], + attention_mask=padded_batch["prompt_attention_mask"], + decoder_input_ids=padded_batch.get("completion_decoder_input_ids"), + labels=padded_batch["completion_labels"], + ).logits + + if self.calculate_KL: + KL_logits = self.model( + padded_batch["KL_prompt_input_ids"], + attention_mask=padded_batch["KL_prompt_attention_mask"], + decoder_input_ids=padded_batch.get("KL_completion_decoder_input_ids"), + labels=padded_batch["KL_completion_labels"], + ).logits + else: + completion_logits = self.model( + padded_batch["completion_input_ids"], + attention_mask=padded_batch["completion_attention_mask"], + ).logits + + if self.calculate_KL: + KL_logits = self.model( + padded_batch["KL_completion_input_ids"], + attention_mask=padded_batch["KL_completion_attention_mask"], + ).logits + else: + if self.is_encoder_decoder: + completion_logits = self.ref_model( + padded_batch["prompt_input_ids"], + attention_mask=padded_batch["prompt_attention_mask"], + decoder_input_ids=padded_batch.get("completion_decoder_input_ids"), + labels=padded_batch["completion_labels"], + ).logits + + if self.calculate_KL: + KL_logits = self.ref_model( + padded_batch["KL_prompt_input_ids"], + attention_mask=padded_batch["KL_prompt_attention_mask"], + decoder_input_ids=padded_batch.get("KL_completion_decoder_input_ids"), + labels=padded_batch["KL_completion_labels"], + ).logits + else: + 
completion_logits = self.ref_model( + padded_batch["completion_input_ids"], attention_mask=padded_batch["completion_attention_mask"] + ).logits + + if self.calculate_KL: + KL_logits = self.ref_model( + padded_batch["KL_completion_input_ids"], + attention_mask=padded_batch["KL_completion_attention_mask"], + ).logits + + completion_logps = self.get_batch_logps( + completion_logits, + padded_batch["completion_labels"], + average_log_prob=False, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + + if self.calculate_KL: + KL_logps = self.get_batch_logps( + KL_logits, + padded_batch["KL_completion_labels"], + average_log_prob=False, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + else: + KL_logps = None + + return completion_logps, KL_logps + + @staticmethod + def get_batch_logps( + logits: torch.FloatTensor, + labels: torch.LongTensor, + average_log_prob: bool = False, + label_pad_token_id: int = -100, + is_encoder_decoder: bool = False, + ) -> torch.FloatTensor: + """Compute the log probabilities of the given labels under the given logits. + + Args: + logits: + Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size) + labels: + Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are + ignored. Shape: (batch_size, sequence_length) + average_log_prob: + If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the + log probabilities of the (non-masked) tokens. + label_pad_token_id: + The label value to ignore when computing log probabilities. + is_encoder_decoder: + Whether the model is an encoder-decoder model. If True, the labels are not shifted and the logits are + assumed to already be aligned with the labels. If False, the labels are shifted to the right by one + position, and the logits are assumed to be aligned with the shifted labels. + + Returns: + A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the + given logits. 
+ """ + if logits.shape[:-1] != labels.shape: + raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.") + + if not is_encoder_decoder: + labels = labels[:, 1:].clone() + logits = logits[:, :-1, :] + else: + # Fixes end-dec RuntimeError + labels = labels.clone() + + loss_mask = labels != label_pad_token_id + + # dummy token; we'll ignore the losses on these tokens later + labels[labels == label_pad_token_id] = 0 + + per_token_logps = selective_log_softmax(logits, labels) + + if average_log_prob: + return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) + else: + return (per_token_logps * loss_mask).sum(-1) + + def forward( + self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]] + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + KL_logps = self._compute_kl_logps(model, batch) + + model_kwargs = ( + { + "labels": batch["completion_labels"], + "decoder_input_ids": batch.get("completion_decoder_input_ids"), + } + if self.is_encoder_decoder + else {} + ) + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + outputs = model( + batch["completion_input_ids"], + attention_mask=batch["completion_attention_mask"], + **model_kwargs, + ) + completion_logits = outputs.logits + + completion_logps = self.get_batch_logps( + completion_logits, + batch["completion_labels"], + average_log_prob=False, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + + if completion_logps.shape[0] != len(batch["label"]): + raise ValueError( + "There is a mismatch between the number of examples in this batch and the number of " + "examples for which an output sequence was predicted." + ) + + chosen_idx = [i for i in range(completion_logps.shape[0]) if batch["label"][i] is True] + rejected_idx = [i for i in range(completion_logps.shape[0]) if batch["label"][i] is False] + + chosen_logps = completion_logps[chosen_idx, ...] + rejected_logps = completion_logps[rejected_idx, ...] + + chosen_logits = completion_logits[chosen_idx, ...] + rejected_logits = completion_logits[rejected_idx, ...] + + if self.aux_loss_enabled: + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, KL_logps, outputs.aux_loss) + else: + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, KL_logps) + + def kto_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + policy_KL_logps: torch.FloatTensor, + reference_chosen_logps: torch.FloatTensor, + reference_rejected_logps: torch.FloatTensor, + reference_KL_logps: torch.FloatTensor, + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """Compute the KTO loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps: + Log probabilities of the policy model for the chosen responses. Shape: (num(chosen) in batch_size,) + policy_rejected_logps: + Log probabilities of the policy model for the rejected responses. Shape: (num(rejected) in batch_size,) + policy_KL_logps: Log probabilities of the policy model for the KL responses. Shape: (batch_size,) + reference_chosen_logps: + Log probabilities of the reference model for the chosen responses. Shape: (num(chosen) in batch_size,) + reference_rejected_logps: + Log probabilities of the reference model for the rejected responses. 
Shape: (num(rejected) in + batch_size,) + reference_KL_logps: Log probabilities of the reference model for the KL responses. Shape: (batch_size,) + + Returns: + A tuple of four tensors: (losses, chosen_rewards, rejected_rewards, KL). The losses tensor contains the KTO + loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for + the chosen and rejected responses, respectively. The KL tensor contains the detached KL divergence estimate + between the policy and reference models. + """ + if self.calculate_KL: + kl = (policy_KL_logps - reference_KL_logps).mean().detach() + kl = self.accelerator.gather_for_metrics(kl).mean().clamp(min=0) + else: + kl = torch.zeros(1).to(policy_chosen_logps.device) + + # Chosen losses + if policy_chosen_logps.shape[0] != 0 or reference_chosen_logps.shape[0] != 0: + chosen_logratios = policy_chosen_logps - reference_chosen_logps + + if self.loss_type == "kto": + # Eqn (7) of the KTO paper (https://huggingface.co/papers/2402.01306) + chosen_losses = 1 - F.sigmoid(self.beta * (chosen_logratios - kl)) + elif self.loss_type == "apo_zero_unpaired": + # Unpaired variant of Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266) + # Use this loss when you believe the chosen outputs are better than your model's default output + chosen_losses = 1 - F.sigmoid(self.beta * chosen_logratios) + + chosen_rewards = self.beta * chosen_logratios.detach() + + else: + # lists can't be empty -- if they are, then accelerate.gather will hang + chosen_losses = torch.Tensor([]).to(self.accelerator.device) + chosen_rewards = torch.Tensor([]).to(self.accelerator.device) + + # Rejected losses + if policy_rejected_logps.shape[0] != 0 or reference_rejected_logps.shape[0] != 0: + rejected_logratios = policy_rejected_logps - reference_rejected_logps + + if self.loss_type == "kto": + rejected_losses = 1 - F.sigmoid(self.beta * (kl - rejected_logratios)) + elif self.loss_type == "apo_zero_unpaired": + rejected_losses = F.sigmoid(self.beta * rejected_logratios) + + rejected_rewards = self.beta * rejected_logratios.detach() + else: + # lists can't be empty -- if they are, then accelerate.gather will hang + rejected_losses = torch.Tensor([]).to(self.accelerator.device) + rejected_rewards = torch.Tensor([]).to(self.accelerator.device) + + losses = torch.cat( + (self.desirable_weight * chosen_losses, self.undesirable_weight * rejected_losses), + 0, + ) + + return losses, chosen_rewards, rejected_rewards, kl + + def _compute_kl_logps(self, model, batch): + """Compute KL log probabilities for a given batch.""" + KL_logps = None + if self.calculate_KL: + if self.is_encoder_decoder: + KL_model_kwargs = { + "input_ids": batch["KL_prompt_input_ids"], + "attention_mask": batch["KL_prompt_attention_mask"], + "labels": batch["KL_completion_labels"], + "decoder_input_ids": batch.get("KL_completion_decoder_input_ids"), + } + else: + KL_model_kwargs = { + "input_ids": batch["KL_completion_input_ids"], + "attention_mask": batch["KL_completion_attention_mask"], + } + + with torch.no_grad(): + KL_logits = model(**KL_model_kwargs).logits + + KL_logps = self.get_batch_logps( + KL_logits, + batch["KL_completion_labels"], + average_log_prob=False, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + return KL_logps + + def _compute_loss_liger(self, model, batch): + """ + Compute the KTO loss using the Liger-Kernel's LigerFusedLinearKTOLoss. 
+ + Args: + model: + The policy model used for generating log probabilities and outputs. It could be an encoder-decoder + model or a regular language model. + batch: A dictionary containing the input data and labels for the batch. + + Returns: + A dictionary containing the following keys: + - "loss": The computed KTO loss for the batch. + - "chosen_logits_sum": Sum of the logits for the chosen responses from the policy model. + - "rejected_logits_sum": Sum of the logits for the rejected responses from the policy model. + - "chosen_logps": Log probabilities of the chosen responses from the policy model. + - "rejected_logps": Log probabilities of the rejected responses from the policy model. + - "chosen_rewards": Rewards for the chosen responses. + - "rejected_rewards": Rewards for the rejected responses. + - "kl": The KL divergence between the policy and reference models (detached). + + If auxiliary loss is enabled, the dictionary will also include: + - "aux_loss": The auxiliary loss from the model outputs. + """ + policy_KL_logps = self._compute_kl_logps(model, batch) + reference_KL_logps = self._compute_kl_logps(self.ref_model, batch) + if self.calculate_KL: + kl = (policy_KL_logps - reference_KL_logps).mean().detach() + kl = self.accelerator.gather_for_metrics(kl).mean().clamp(min=0) + else: + kl = torch.zeros(1).to(self.accelerator.device) + + model_kwargs = ( + { + "labels": batch["completion_labels"], + "decoder_input_ids": batch.get("completion_decoder_input_ids"), + } + if self.is_encoder_decoder + else {} + ) + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + if self.is_encoder_decoder: + # 1. Get encoder outputs + encoder_outputs = model.get_encoder()( + batch["completion_input_ids"], + attention_mask=batch["completion_attention_mask"], + return_dict=True, + **model_kwargs, + ) + # 2. Get decoder outputs + outputs = model.get_decoder()( + input_ids=model_kwargs["decoder_input_ids"], + encoder_hidden_states=encoder_outputs.last_hidden_state, + use_cache=False, + **model_kwargs, + ) + # 1. Get reference encoder outputs + ref_encoder_outputs = self.ref_model.get_encoder()( + batch["completion_input_ids"], + attention_mask=batch["completion_attention_mask"], + return_dict=True, + **model_kwargs, + ) + # 2. 
Get reference decoder outputs + ref_outputs = self.ref_model.get_decoder()( + input_ids=model_kwargs["decoder_input_ids"], + encoder_hidden_states=ref_encoder_outputs.last_hidden_state, + use_cache=False, + **model_kwargs, + ) + else: + # skip the lm head and get the last hidden state + if hasattr(model, "get_decoder") and model.get_decoder() is not None: + base_model = model.get_decoder() + else: + base_attr = getattr(model, "base_model_prefix", self.args.base_model_attribute_name) + base_model = getattr(model, base_attr, model) + outputs = base_model( + batch["completion_input_ids"], + attention_mask=batch["completion_attention_mask"], + use_cache=False, + **model_kwargs, + ) + + # reference model + if hasattr(self.ref_model, "get_decoder") and self.ref_model.get_decoder() is not None: + ref_base_model = self.ref_model.get_decoder() + else: + ref_attr = getattr(self.ref_model, "base_model_prefix", self.args.base_model_attribute_name) + ref_base_model = getattr(self.ref_model, ref_attr, self.ref_model) + ref_outputs = ref_base_model( + batch["completion_input_ids"], + attention_mask=batch["completion_attention_mask"], + use_cache=False, + **model_kwargs, + ) + lm_head = model.get_output_embeddings() + ref_lm_head = self.ref_model.get_output_embeddings() + + ( + loss, + ( + chosen_logps_sum, + rejected_logps_sum, + chosen_logits_sum, + rejected_logits_sum, + chosen_rewards_sum, + rejected_rewards_sum, + ), + ) = self.kto_loss_fn( + _input=outputs.last_hidden_state[:, :-1] if not self.is_encoder_decoder else outputs.last_hidden_state, + lin_weight=lm_head.weight, + target=batch["completion_labels"][:, 1:], + bias=lm_head.bias if hasattr(lm_head, "bias") else None, + preference_labels=torch.tensor(batch["label"], dtype=torch.bool).to(self.accelerator.device), + ref_input=ref_outputs.last_hidden_state[:, :-1] + if not self.is_encoder_decoder + else ref_outputs.last_hidden_state, + ref_weight=ref_lm_head.weight, + ref_bias=ref_lm_head.bias if hasattr(ref_lm_head, "bias") else None, + kl=kl, + ) + + output = { + "loss": loss, + "chosen_logits_sum": chosen_logits_sum, + "rejected_logits_sum": rejected_logits_sum, + "chosen_logps_sum": chosen_logps_sum, + "rejected_logps_sum": rejected_logps_sum, + "chosen_rewards_sum": chosen_rewards_sum, + "rejected_rewards_sum": rejected_rewards_sum, + "kl": kl, + } + if self.aux_loss_enabled: + output["aux_loss"] = outputs.aux_loss + + return output + + def get_batch_loss_metrics( + self, + model, + batch: dict[str, Union[list, torch.LongTensor]], + ): + """Compute the KTO loss and other metrics for the given batch of inputs for train or test.""" + metrics = {} + batch = {k: (v.to(self.accelerator.device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()} + + labels = torch.tensor(batch["label"]) + num_chosen = labels.sum().to(self.accelerator.device) + num_rejected = (len(labels) - num_chosen).to(self.accelerator.device) + + if self.args.use_liger_loss: + model_output = self._compute_loss_liger(model, batch) + losses = model_output["loss"] + policy_chosen_logits = model_output["chosen_logits_sum"] + policy_rejected_logits = model_output["rejected_logits_sum"] + policy_chosen_logps = model_output["chosen_logps_sum"] + policy_rejected_logps = model_output["rejected_logps_sum"] + chosen_rewards = model_output["chosen_rewards_sum"] + rejected_rewards = model_output["rejected_rewards_sum"] + kl = model_output["kl"] + if self.aux_loss_enabled: + aux_loss = model_output["aux_loss"] + else: + forward_output = self.forward(model, batch) + ( + 
policy_chosen_logps, + policy_rejected_logps, + policy_chosen_logits, + policy_rejected_logits, + policy_KL_logps, + ) = forward_output[:5] + if self.aux_loss_enabled: + aux_loss = forward_output[5] + + # if reference_logps in batch use them, otherwise use the reference model + if "reference_logps" in batch: + chosen_idx = [i for i in range(batch["reference_logps"].shape[0]) if batch["label"][i] is True] + rejected_idx = [i for i in range(batch["reference_logps"].shape[0]) if batch["label"][i] is False] + + reference_chosen_logps = batch["reference_logps"][chosen_idx, ...] + reference_rejected_logps = batch["reference_logps"][rejected_idx, ...] + if self.calculate_KL: + reference_KL_logps = batch["reference_KL_logps"] + else: + reference_KL_logps = None + else: + with torch.no_grad(): + if self.ref_model is None: + with self.null_ref_context(): + ( + reference_chosen_logps, + reference_rejected_logps, + _, + _, + reference_KL_logps, + ) = self.forward(self.model, batch)[:5] + else: + ( + reference_chosen_logps, + reference_rejected_logps, + _, + _, + reference_KL_logps, + ) = self.forward(self.ref_model, batch)[:5] + + losses, chosen_rewards, rejected_rewards, kl = self.kto_loss( + policy_chosen_logps, + policy_rejected_logps, + policy_KL_logps, + reference_chosen_logps, + reference_rejected_logps, + reference_KL_logps, + ) + + metrics["kl"] = kl.item() + + all_num_chosen = self.accelerator.gather_for_metrics(num_chosen).sum().item() + all_num_rejected = self.accelerator.gather_for_metrics(num_rejected).sum().item() + + if all_num_chosen > 0: + metrics["rewards/chosen_sum"] = ( + self.accelerator.gather_for_metrics(chosen_rewards.nansum()).nansum().item() + ) + metrics["logps/chosen_sum"] = ( + self.accelerator.gather_for_metrics(policy_chosen_logps.nansum()).nansum().item() + ) + metrics["logits/chosen_sum"] = ( + self.accelerator.gather_for_metrics(policy_chosen_logits.nansum()).nansum().item() + ) + metrics["count/chosen"] = all_num_chosen + + if all_num_rejected > 0: + metrics["rewards/rejected_sum"] = ( + self.accelerator.gather_for_metrics(rejected_rewards.nansum()).nansum().item() + ) + metrics["logps/rejected_sum"] = ( + self.accelerator.gather_for_metrics(policy_rejected_logps.nansum()).nansum().item() + ) + metrics["logits/rejected_sum"] = ( + self.accelerator.gather_for_metrics(policy_rejected_logits.nansum()).nansum().item() + ) + metrics["count/rejected"] = all_num_rejected + + loss = losses.nanmean() + if self.aux_loss_enabled: + loss += self.aux_loss_coef * aux_loss + + return loss, metrics + + def compute_loss( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + return_outputs=False, + num_items_in_batch=None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]: + compute_loss_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with compute_loss_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs) + + # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class: + loss = loss.to(self.args.device) + # force log the metrics + if self.accelerator.is_main_process: + self.store_metrics(metrics, train_eval="train") + + if return_outputs: + return (loss, metrics) + return loss + + def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None: + for key, value in metrics.items(): + 
self._stored_metrics[train_eval][key].append(value) + + def _get_train_sampler(self, dataset: Optional[Dataset] = None) -> Optional[torch.utils.data.Sampler]: + if dataset is None: + dataset = self.train_dataset + if dataset is None or not has_length(dataset): + return None + return SequentialSampler(dataset) + + def generate_from_model_and_ref(self, model, batch: dict[str, torch.LongTensor]) -> tuple[str, str]: + """Generate samples from the model and reference model for the given batch of inputs.""" + + # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with + # the torch amp context manager as some hidden states are silently casted to full precision. + generate_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with generate_context_manager: + policy_output = model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + + # if reference_output in batch use that otherwise use the reference model + if "reference_output" in batch: + reference_output = batch["reference_output"] + else: + if self.ref_model is None: + with self.null_ref_context(): + reference_output = self.model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + else: + reference_output = self.ref_model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + + policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id) + policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True) + + reference_output = pad_to_length(reference_output, self.max_length, self.processing_class.pad_token_id) + reference_output_decoded = self.processing_class.batch_decode(reference_output, skip_special_tokens=True) + + return policy_output_decoded, reference_output_decoded + + def prediction_step( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[list[str]] = None, + ): + if ignore_keys is None: + if hasattr(model, "config"): + ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + prediction_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + with torch.no_grad(), prediction_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs) + + # force log the metrics + if self.accelerator.is_main_process: + self.store_metrics(metrics, train_eval="eval") + + if prediction_loss_only: + return (loss.detach(), None, None) + + # logits for the chosen and rejected samples from model + logits_dict = {} + if "logits/chosen_sum" in metrics: + logits_dict["eval_logits/chosen"] = metrics["logits/chosen_sum"] + if "logits/rejected_sum" in metrics: + logits_dict["eval_logits/rejected"] = metrics["logits/rejected_sum"] + logits = [v for k, v in logits_dict.items() if k not in ignore_keys] + logits = torch.tensor(logits, device=self.accelerator.device) + labels = torch.zeros(logits.shape[0], 
device=self.accelerator.device) + + return (loss.detach(), logits, labels) + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "eval", + ) -> EvalLoopOutput: + """ + Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by + `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. + """ + + # Sample and save to game log if requested (for one batch to save time) + if self.generate_during_eval: + # Generate random indices within the range of the total number of samples + num_samples = len(dataloader.dataset) + random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size) + + # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader + random_batch_dataset = dataloader.dataset.select(random_indices) + random_batch = self.data_collator(random_batch_dataset) + random_batch = self._prepare_inputs(random_batch) + + target_labels = torch.tensor(random_batch["label"], dtype=torch.bool, device=self.accelerator.device) + target_indices = torch.where(~target_labels)[0] + target_batch = { + "prompt_input_ids": random_batch["prompt_input_ids"][target_indices], + "prompt_attention_mask": random_batch["prompt_attention_mask"][target_indices], + "prompt": itemgetter(*target_indices)(random_batch["prompt"]), + } + policy_output_decoded, ref_output_decoded = self.generate_from_model_and_ref(self.model, target_batch) + + table = pd.DataFrame( + columns=["Prompt", "Policy", "Ref Model"], + data=[ + [prompt, pol[len(prompt) :], ref[len(prompt) :]] + for prompt, pol, ref in zip(target_batch["prompt"], policy_output_decoded, ref_output_decoded) + ], + ) + if "wandb" in self.args.report_to: + wandb.log({"game_log": wandb.Table(data=table)}) + + if "comet_ml" in self.args.report_to: + log_table_to_comet_experiment( + name="game_log.csv", + table=table, + ) + + # Base evaluation + initial_output = super().evaluation_loop( + dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix + ) + + return initial_output + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + """ + Log `logs` on the various objects watching training, including stored metrics. + + Args: + logs (`dict[str, float]`): + The values to log. + start_time (`float` or `None`, *optional*, defaults to `None`): + Start time of the training. 
+ """ + # logs either has 'loss' or 'eval_loss' + train_eval = "train" if "loss" in logs else "eval" + # train metrics should have no prefix, eval should have 'eval_' + prefix = "eval_" if train_eval == "eval" else "" + # accumulate average metrics from sums and lengths + for split in ["chosen", "rejected"]: + if f"count/{split}" in self._stored_metrics[train_eval]: + count_sum = torch.Tensor(self._stored_metrics[train_eval][f"count/{split}"]).sum().item() + for metric in ["rewards", "logps", "logits"]: + logs[f"{prefix}{metric}/{split}"] = ( + torch.Tensor(self._stored_metrics[train_eval][f"{metric}/{split}_sum"]).sum().item() + / count_sum + ) + # delete obsolete metric + del self._stored_metrics[train_eval][f"{metric}/{split}_sum"] + del self._stored_metrics[train_eval][f"count/{split}"] + # calculate reward margin + if f"{prefix}rewards/chosen" in logs and f"{prefix}rewards/rejected" in logs: + logs[f"{prefix}rewards/margins"] = logs[f"{prefix}rewards/chosen"] - logs[f"{prefix}rewards/rejected"] + # Add averaged stored metrics to logs + for key, metrics in self._stored_metrics[train_eval].items(): + logs[f"{prefix}{key}"] = torch.Tensor(metrics).mean().item() + del self._stored_metrics[train_eval] + return super().log(logs, start_time) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + # docstyle-ignore + citation = textwrap.dedent("""\ + @article{ethayarajh2024kto, + title = {{KTO: Model Alignment as Prospect Theoretic Optimization}}, + author = {Kawin Ethayarajh and Winnie Xu and Niklas Muennighoff and Dan Jurafsky and Douwe Kiela}, + year = 2024, + eprint = {arXiv:2402.01306}, + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="KTO", + trainer_citation=citation, + paper_title="KTO: Model Alignment as Prospect Theoretic Optimization", + paper_id="2402.01306", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothKTOTrainer(_UnslothKTOTrainer): + """ + +Initialize KTOTrainer. + +Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForSequenceClassification`. + ref_model (`PreTrainedModelWrapper`): + Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation + and loss. If no reference model is provided, the trainer will create a reference model with the same + architecture as the model to be optimized. + args (`KTOConfig`): + The arguments to use for training. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + data_collator (`transformers.DataCollator`, *optional*, defaults to `None`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + peft_config (`dict`, defaults to `None`): + The PEFT configuration to use for training. 
If you pass a PEFT configuration, the model will be wrapped in + a PEFT model. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. + model_adapter_name (`str`, defaults to `None`): + Name of the train target PEFT adapter, when using LoRA with multiple adapters. + ref_adapter_name (`str`, defaults to `None`): + Name of the reference PEFT adapter, when using LoRA with multiple adapters. + + """ + def __init__( + self, + model = None, + ref_model = None, + args = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + data_collator = None, + model_init = None, + callbacks = None, + preprocess_logits_for_metrics = None, + peft_config = None, + compute_metrics = None, + model_adapter_name = None, + ref_adapter_name = None, + **kwargs + ): + if args is None: args = UnslothKTOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if 
isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('kto_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? [TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + model = model, + ref_model = ref_model, + args = args, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + data_collator = data_collator, + model_init = model_init, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config, + compute_metrics = compute_metrics, + model_adapter_name = model_adapter_name, + ref_adapter_name = ref_adapter_name,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass + + +if hasattr(logger, "addFilter"): + import logging + class HideLoggingMessage(logging.Filter): + def __init__(self, text): self.text = text + def filter(self, x): return not (self.text in x.getMessage()) + pass + logger.addFilter(HideLoggingMessage("`use_cache=True`")) + diff --git a/unsloth_compiled_cache/UnslothNashMDTrainer.py b/unsloth_compiled_cache/UnslothNashMDTrainer.py new file mode 100644 index 
0000000000000000000000000000000000000000..9cb41b8ce12b59ccc8dbda7ab14ce02d3d27977e --- /dev/null +++ b/unsloth_compiled_cache/UnslothNashMDTrainer.py @@ -0,0 +1,1285 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . + +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.nash_md_trainer import (Any, BaseImageProcessor, BasePairwiseJudge, Callable, Dataset, EvalPrediction, F, FeatureExtractionMixin, GeometricMixtureWrapper, IterableDataset, NashMDConfig, NashMDTrainer, OnlineDPOTrainer, OptimizerNames, Optional, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SIMPLE_CHAT_TEMPLATE, TrainerCallback, Union, empty_cache, generate_model_card, get_comet_experiment_url, get_reward, is_conversational, is_peft_available, is_wandb_available, jinja2, maybe_apply_chat_template, nn, os, selective_log_softmax, textwrap, torch, truncate_right, unwrap_model_for_generation) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = 
chunk_index.unsqueeze(-1)).squeeze(-1)
+        logsumexp_values = torch.logsumexp(chunk_logits, dim = -1)
+        per_token_logps = selected_logits - logsumexp_values
+        all_per_token_logps.append(per_token_logps)
+    pass
+    all_per_token_logps = torch.concat(all_per_token_logps)
+    all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1]))
+    return all_per_token_logps
+
+def calculate_pad_tokens_in_prompt(
+    input_ids: torch.Tensor,
+    logits_to_keep: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a prompt tensor, returns the number of left-padding tokens in each sequence, so [pad, pad, pad, cat] counts as 3 tokens.
+    """
+    if logits_to_keep >= input_ids.shape[1]:
+        raise ValueError("logits_to_keep must be smaller than the sequence length.")
+
+    prompt_section = input_ids[:, :-logits_to_keep]
+
+    padding_mask = (prompt_section == pad_token_id)
+
+    pad_token_counts = padding_mask.sum(dim=1)
+
+    return pad_token_counts
+
+def create_completion_attention_mask(
+    completion_input_ids: torch.Tensor,
+    left_pad_tokens_per_prompt: torch.Tensor,
+    max_left_pad: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a sequence [p,p,p,c,c,c,pad,pad,pad],
+
+    where p are extra prompt tokens left over from slicing the torch tensor, c are completion tokens,
+    and pad are pad tokens, this function builds a completion mask that zeroes out the pad
+    and p tokens, so in this example [0,0,0,1,1,1,0,0,0].
+    """
+    batch_size, completion_len = completion_input_ids.shape
+    device = completion_input_ids.device
+
+    num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
+
+    indices = torch.arange(completion_len, device=device).unsqueeze(0)
+    shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
+
+    non_padding_mask = (completion_input_ids != pad_token_id)
+
+    final_mask = shift_mask & non_padding_mask
+
+    return final_mask
+
+def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
+    """
+    Moves all padding tokens in each sequence of a batch to the right.
+    """
+    mask = (tensor != pad_id)
+    # Must use stable=True since the binary mask is unordered
+    sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
+    packed_tensor = torch.gather(tensor, 1, sorted_indices)
+    return packed_tensor
+
+def align_logprobs_with_mask(
+    logprob_tensor: torch.Tensor,
+    attention_mask: torch.Tensor,
+    pad_value: float = 0.0
+) -> torch.Tensor:
+    """
+    Aligns a log probability tensor with a given attention mask.
+    """
+
+    device = logprob_tensor.device
+    batch_size, logprob_seq_len = logprob_tensor.shape
+    mask_seq_len = attention_mask.shape[1]
+
+    padded_logprobs = torch.full(
+        attention_mask.shape,
+        fill_value=pad_value,
+        dtype=logprob_tensor.dtype,
+        device=device
+    )
+
+    left_pad_counts = torch.argmax(attention_mask, dim=1)
+
+    cols = torch.arange(logprob_seq_len, device=device)
+    dest_indices = left_pad_counts.unsqueeze(1) + cols
+
+    # Create destination row indices
+    # Shape: [batch_size, logprob_seq_len]
+    row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
+
+    # --- Filter out-of-bounds indices and perform assignment ---
+    # Create a mask to identify only the indices that are within the bounds
+    # of the target tensor's sequence length.
+    valid_mask = dest_indices < mask_seq_len
+
+    # Use this mask to select only the valid row indices, column indices,
+    # and the corresponding values from the logprob tensor.
+    # This flattens the selected elements into 1D tensors.
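+    # Example: for a row with attention_mask [0, 0, 1, 1, 1] and logprobs [a, b, c],
+    # left_pad_counts is 2, dest_indices is [2, 3, 4], and the output row becomes
+    # [pad, pad, a, b, c].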
+ valid_rows = row_indices[valid_mask] + valid_cols = dest_indices[valid_mask] + valid_vals = logprob_tensor[valid_mask] + + # Place the valid values into their correct positions in the padded tensor + # using a single, efficient advanced indexing operation. + padded_logprobs[valid_rows, valid_cols] = valid_vals + + return padded_logprobs +@dataclass +class UnslothNashMDConfig(NashMDConfig): + """ + +Configuration class for the [`NashMDTrainer`]. + +Subclass of [`OnlineDPOConfig`] we can use all its arguments and add the following: + +Parameters: + mixture_coef (`float` or `list[float]`, *optional*, defaults to `0.5`): + Logit mixture coefficient for the model and reference model. If a list of floats is provided then the + mixture coefficient is selected for each new epoch and the last coefficient is used for the rest of the + epochs. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + max_seq_length : Optional[int] = field( + default = None, + metadata = {'help': 'Maximum sequence length to truncate to.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = None, + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + parallelism_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + project = 'huggingface', + trackio_space_id = 'trackio', 
+        ddp_find_unused_parameters = None,
+        ddp_bucket_cap_mb = None,
+        ddp_broadcast_buffers = None,
+        dataloader_pin_memory = True,
+        dataloader_persistent_workers = False,
+        skip_memory_metrics = True,
+        use_legacy_prediction_loop = False,
+        push_to_hub = False,
+        resume_from_checkpoint = None,
+        hub_model_id = None,
+        hub_strategy = 'every_save',
+        hub_token = None,
+        hub_private_repo = None,
+        hub_always_push = False,
+        hub_revision = None,
+        gradient_checkpointing = True,
+        gradient_checkpointing_kwargs = None,
+        include_inputs_for_metrics = False,
+        eval_do_concat_batches = True,
+        fp16_backend = 'auto',
+        push_to_hub_model_id = None,
+        push_to_hub_organization = None,
+        push_to_hub_token = None,
+        mp_parameters = '',
+        auto_find_batch_size = False,
+        full_determinism = False,
+        torchdynamo = None,
+        ray_scope = 'last',
+        ddp_timeout = 1800,
+        torch_compile = False,
+        torch_compile_backend = None,
+        torch_compile_mode = None,
+        include_tokens_per_second = False,
+        include_num_input_tokens_seen = False,
+        neftune_noise_alpha = None,
+        optim_target_modules = None,
+        batch_eval_metrics = False,
+        eval_on_start = False,
+        use_liger_kernel = False,
+        liger_kernel_config = None,
+        eval_use_gather_object = False,
+        average_tokens_across_devices = True,
+        reward_model_path = None,
+        judge = None,
+        max_new_tokens = 64,
+        max_length = 512,
+        temperature = 0.9,
+        top_p = 1.0,
+        top_k = None,
+        min_p = None,
+        repetition_penalty = 1.0,
+        generation_kwargs = {},
+        use_transformers_paged = False,
+        cache_implementation = None,
+        missing_eos_penalty = None,
+        loss_type = 'sigmoid',
+        disable_dropout = True,
+        use_vllm = False,
+        vllm_model_impl = 'vllm',
+        vllm_guided_decoding_regex = None,
+        vllm_gpu_memory_utilization = 0.55,
+        vllm_mode = 'colocate',
+        vllm_server_base_url = None,
+        vllm_server_host = '0.0.0.0',
+        vllm_server_port = 8000,
+        vllm_server_timeout = 240.0,
+        vllm_tensor_parallel_size = 1,
+        ds3_gather_for_generation = True,
+        model_init_kwargs = None,
+        reward_weights = None,
+        dataset_num_proc = None,
+        gpu_memory_utilization = None,
+        vllm_sampling_params = None,
+        unsloth_num_chunks = -1,
+        max_seq_length = None,
+        **kwargs,
+    ):
+        if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!')
+        if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too large (> 1)! Consider decreasing it to 1e-1, otherwise gradient updates will explode!')
+        if output_dir is None and save_strategy == 'steps' and save_steps == 500:
+            output_dir = 'unsloth_training_checkpoints'
+            save_strategy = 'no'
+        if dataset_num_proc is None:
+            from multiprocessing import cpu_count
+            dataset_num_proc = min(max(cpu_count()+4, 2), 64)
+        if temperature <= 0:
+            raise ValueError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.')
+        elif temperature >= 10:
+            raise ValueError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.')
+
+
+        super().__init__(
+            output_dir = output_dir,
+            overwrite_output_dir = overwrite_output_dir,
+            do_train = do_train,
+            do_eval = do_eval,
+            do_predict = do_predict,
+            eval_strategy = eval_strategy,
+            prediction_loss_only = prediction_loss_only,
+            per_device_train_batch_size = per_device_train_batch_size,
+            per_device_eval_batch_size = per_device_eval_batch_size,
+            per_gpu_train_batch_size = per_gpu_train_batch_size,
+            per_gpu_eval_batch_size = per_gpu_eval_batch_size,
+            gradient_accumulation_steps = gradient_accumulation_steps,
+            eval_accumulation_steps = eval_accumulation_steps,
+            eval_delay = eval_delay,
+            torch_empty_cache_steps = torch_empty_cache_steps,
+            learning_rate = learning_rate,
+            weight_decay = weight_decay,
+            adam_beta1 = adam_beta1,
+            adam_beta2 = adam_beta2,
+            adam_epsilon = adam_epsilon,
+            max_grad_norm = max_grad_norm,
+            num_train_epochs = num_train_epochs,
+            max_steps = max_steps,
+            lr_scheduler_type = lr_scheduler_type,
+            warmup_ratio = warmup_ratio,
+            warmup_steps = warmup_steps,
+            log_level = log_level,
+            log_level_replica = log_level_replica,
+            log_on_each_node = log_on_each_node,
+            logging_dir = logging_dir,
+            logging_strategy = logging_strategy,
+            logging_first_step = logging_first_step,
+            logging_steps = logging_steps,
+            logging_nan_inf_filter = logging_nan_inf_filter,
+            save_strategy = save_strategy,
+            save_steps = save_steps,
+            save_total_limit = save_total_limit,
+            save_safetensors = save_safetensors,
+            save_on_each_node = save_on_each_node,
+            save_only_model = save_only_model,
+            restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint,
+            no_cuda = no_cuda,
+            use_cpu = use_cpu,
+            use_mps_device = use_mps_device,
+            seed = seed,
+            data_seed = data_seed,
+            jit_mode_eval = jit_mode_eval,
+            bf16 = bf16,
+            fp16 = fp16,
+            fp16_opt_level = fp16_opt_level,
+            half_precision_backend = half_precision_backend,
+            bf16_full_eval = bf16_full_eval,
+            fp16_full_eval = fp16_full_eval,
+            tf32 = tf32,
+            local_rank = local_rank,
+            ddp_backend = ddp_backend,
+            tpu_num_cores = tpu_num_cores,
+            tpu_metrics_debug = tpu_metrics_debug,
+            debug = debug,
+            dataloader_drop_last = dataloader_drop_last,
+            eval_steps = eval_steps,
+            dataloader_num_workers = dataloader_num_workers,
+            dataloader_prefetch_factor = dataloader_prefetch_factor,
+            past_index = past_index,
+            run_name = run_name,
+            disable_tqdm = disable_tqdm,
+            remove_unused_columns = remove_unused_columns,
+            label_names = label_names,
+            load_best_model_at_end = load_best_model_at_end,
+            metric_for_best_model = metric_for_best_model,
+            greater_is_better = greater_is_better,
+            ignore_data_skip = ignore_data_skip,
+            fsdp = fsdp,
+            fsdp_min_num_params = fsdp_min_num_params,
+            fsdp_config = fsdp_config,
+            fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap,
+            accelerator_config = accelerator_config,
+            parallelism_config = parallelism_config,
+            deepspeed = deepspeed,
+
label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + reward_model_path = reward_model_path, + judge = judge, + max_new_tokens = max_new_tokens, + max_length = max_length, + temperature = temperature, + top_p = top_p, + top_k = top_k, + min_p = min_p, + repetition_penalty = repetition_penalty, + generation_kwargs = generation_kwargs, + use_transformers_paged = use_transformers_paged, + cache_implementation = cache_implementation, + missing_eos_penalty = missing_eos_penalty, + loss_type = loss_type, + disable_dropout = disable_dropout, + use_vllm = use_vllm, + vllm_model_impl = vllm_model_impl, + vllm_guided_decoding_regex = vllm_guided_decoding_regex, + vllm_gpu_memory_utilization = vllm_gpu_memory_utilization, + vllm_mode = vllm_mode, + vllm_server_base_url = vllm_server_base_url, + vllm_server_host = vllm_server_host, + vllm_server_port = vllm_server_port, + vllm_server_timeout = vllm_server_timeout, + vllm_tensor_parallel_size = vllm_tensor_parallel_size, + ds3_gather_for_generation = ds3_gather_for_generation, + model_init_kwargs = model_init_kwargs, + reward_weights = reward_weights, + dataset_num_proc = dataset_num_proc, + gpu_memory_utilization = gpu_memory_utilization,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks + self.max_seq_length = max_seq_length +pass + +class _UnslothNashMDTrainer(OnlineDPOTrainer): + r""" + Initialize NashMDTrainer as a subclass of 
[`OnlineDPOTrainer`].
+
+    Args:
+        model (`transformers.PreTrainedModel`):
+            The model to train, preferably an `AutoModelForCausalLM`.
+        ref_model (`PreTrainedModelWrapper`):
+            Hugging Face transformer model with a causal language modeling head. Used for implicit reward computation
+            and loss. If no reference model is provided, the trainer will create a reference model with the same
+            architecture as the model to be optimized.
+        reward_funcs (`transformers.PreTrainedModel`):
+            The reward model to score completions with, preferably an `AutoModelForSequenceClassification`.
+        judge (`BasePairwiseJudge`):
+            The judge to use for pairwise comparison of model completions.
+        args (`NashMDConfig`):
+            The NashMD config arguments to use for training.
+        data_collator (`transformers.DataCollator`):
+            The data collator to use for training. If None is specified, the default data collator
+            (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        train_dataset (`datasets.Dataset`):
+            The dataset to use for training.
+        eval_dataset (`datasets.Dataset`):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        peft_config (`dict`):
+            The PEFT config to use for training.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take an `EvalPrediction` and return a dictionary mapping
+            strings to metric values.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+
+    .. deprecated:: 0.22.0
+        The following parameters are deprecated and will be removed in a future version:
+
+        * `reward_model`: Use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`.
+ """ + + _tag_names = ["trl", "nash-md"] + + def __init__( + self, + model: Union[PreTrainedModel, nn.Module] = None, + ref_model: Union[PreTrainedModel, nn.Module] = None, + reward_funcs: Union[PreTrainedModel, nn.Module, None] = None, + judge: Optional[BasePairwiseJudge] = None, + args: Optional[NashMDConfig] = None, + data_collator: Optional[Callable] = None, + train_dataset: Optional[Union[Dataset, IterableDataset]] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + peft_config: Optional[dict] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + # Deprecated parameters + reward_model: Optional[Union[PreTrainedModel, nn.Module]] = None, + ) -> None: + super().__init__( + model=model, + ref_model=ref_model, + reward_funcs=reward_funcs, + judge=judge, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + reward_processing_classes=processing_class, + peft_config=peft_config, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + reward_model=reward_model, + ) + + self._mixture_coef = self.args.mixture_coef + + # Overwrite the stats dictionary to include NashMD specific statistics + self.stats = { + # Remove "non_score_reward", "rlhf_reward", "scores_margin" + # Add "mixture_coef" + "loss/kl": [], + "objective/entropy": [], + "loss/score": [], + "rewards/probabilities": [], + "rewards/accuracies": [], + "rewards/margins": [], + "logps/chosen": [], + "logps/rejected": [], + "val/model_contain_eos_token": [], + "val/ref_contain_eos_token": [], + "beta": [], + "mixture_coef": [], + } + if self.reward_funcs is not None: + if len(self.reward_funcs) != 1: + raise ValueError("NashMDTrainer only supports one reward function/model.") + self.reward_funcs = self.reward_funcs[0] + self.stats["rewards/chosen"] = [] + self.stats["rewards/rejected"] = [] + + @property + def mixture_coef(self): + if isinstance(self._mixture_coef, list): + epoch = self.state.epoch + return self._mixture_coef[epoch] if epoch < len(self._mixture_coef) else self._mixture_coef[-1] + else: + return self._mixture_coef + + def _generate_completions(self, model, prompts): + # Generate completions from the policy model. + with unwrap_model_for_generation(model, self.accelerator) as unwrapped_policy_for_gen_ctx: + model_output = unwrapped_policy_for_gen_ctx.generate( + input_ids=prompts["input_ids"], + attention_mask=prompts["attention_mask"], + generation_config=self.generation_config, + ) + + # Get the DDP/FSDP unwrapped version of the main model. + # This will be the policy model for GeometricMixtureWrapper (PEFT adapters active if PEFT is used). + policy_model_for_gmw = self.accelerator.unwrap_model(model) + + # Determine the correct reference model for GeometricMixtureWrapper. + # This also needs to be DDP/FSDP unwrapped. + ref_model_for_gmw: torch.nn.Module + if self.ref_model is None: + # No explicit ref_model is provided. + # Use the base of the main `model` if it's a PEFT model. 
+ # policy_model_for_gmw is already DDP-unwrapped. + if is_peft_available() and isinstance(policy_model_for_gmw, PeftModel): + ref_model_for_gmw = policy_model_for_gmw.get_base_model() + else: + # Not a PEFT model (or PEFT not available), or already a base model. + # Use the DDP-unwrapped policy model itself as the reference. + ref_model_for_gmw = policy_model_for_gmw + else: + # An explicit ref_model is provided. Unwrap it for DDP/FSDP. + ref_model_for_gmw = self.accelerator.unwrap_model(self.ref_model) + + # Both models given to GeometricMixtureWrapper (policy_model_for_gmw and ref_model_for_gmw) are DDP-unwrapped. + with torch.no_grad(): # Ensure no_grad context for mixture model generation + mixture_model = GeometricMixtureWrapper( + model=policy_model_for_gmw, + ref_model=ref_model_for_gmw, + generation_config=self.generation_config, + mixture_coef=self.mixture_coef, + device=self.accelerator.device, + ) + + mixture_output = mixture_model.generate( + input_ids=prompts["input_ids"], + attention_mask=prompts["attention_mask"], + generation_config=self.generation_config, + ) + + return model_output, mixture_output + + def _process_completions(self, model_output, mixture_output, prompts): + context_length = prompts["input_ids"].shape[1] + + # Process model completions + model_completion_ids = model_output[:, context_length:] + model_completion_ids, model_completion_mask = truncate_right( + model_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id + ) + model_data = { + "input_ids": torch.cat((prompts["input_ids"], model_completion_ids), dim=1), + "attention_mask": torch.cat((prompts["attention_mask"], model_completion_mask), dim=1), + "raw": prompts["raw"], + } + + # Process reference model completions + mixture_completion_ids = mixture_output[:, context_length:] + mixture_completion_ids, mixture_completion_mask = truncate_right( + mixture_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id + ) + mixture_data = { + "input_ids": torch.cat((prompts["input_ids"], mixture_completion_ids), dim=1), + "attention_mask": torch.cat((prompts["attention_mask"], mixture_completion_mask), dim=1), + "raw": prompts["raw"], + } + + return model_data, mixture_data + + def _compute_rewards(self, model_data, mixture_data, context_length): + with torch.no_grad(): + _, model_scores, _ = get_reward( + self.reward_funcs, model_data["input_ids"], self.processing_class.pad_token_id, context_length + ) + _, mixture_scores, _ = get_reward( + self.reward_funcs, mixture_data["input_ids"], self.processing_class.pad_token_id, context_length + ) + + # Apply EOS penalty if needed + if self.args.missing_eos_penalty is not None: + model_contain_eos = torch.any(model_data["input_ids"] == self.processing_class.eos_token_id, dim=-1) + mixture_contain_eos = torch.any(mixture_data["input_ids"] == self.processing_class.eos_token_id, dim=-1) + model_scores[~model_contain_eos] -= self.args.missing_eos_penalty + mixture_scores[~mixture_contain_eos] -= self.args.missing_eos_penalty + + return model_scores, mixture_scores + + def _compute_judge(self, model_data, mixture_data, context_length): + prompts = model_data["raw"] + model_data_completions = self.processing_class.batch_decode( + model_data["input_ids"][:, context_length:], skip_special_tokens=True + ) + model_data_completions = [completion.strip() for completion in model_data_completions] + + mixture_data_completions = self.processing_class.batch_decode( + mixture_data["input_ids"][:, context_length:], 
skip_special_tokens=True + ) + mixture_data_completions = [completion.strip() for completion in mixture_data_completions] + if is_conversational({"prompt": prompts[0]}): + model_data_completions = [ + [{"role": "assistant", "content": completion}] for completion in model_data_completions + ] + environment = jinja2.Environment() + template = environment.from_string(SIMPLE_CHAT_TEMPLATE) + prompts = [template.render(messages=message) for message in prompts] + model_data_completions = [template.render(messages=completion) for completion in model_data_completions] + + mixture_data_completions = [ + [{"role": "assistant", "content": completion}] for completion in mixture_data_completions + ] + mixture_data_completions = [ + template.render(messages=completion) for completion in mixture_data_completions + ] + + probability = self.judge.judge( + prompts, + list(zip(model_data_completions, mixture_data_completions)), + return_scores=True, + ) + return torch.tensor(probability, device=model_data["input_ids"].device) + + def _compute_logprobs(self, model, model_data, context_length): + def compute_logprobs_for_data(m, data): + output = m(data["input_ids"], attention_mask=data["attention_mask"]) + logits = output.logits[:, context_length - 1 : -1] + token_logprobs = selective_log_softmax(logits, data["input_ids"][:, context_length:]) + return token_logprobs + + # Compute logprobs for model completions under the model + model_logprobs_model_data = compute_logprobs_for_data(model, model_data) + + # Compute logprobs of model completions under the reference model + with torch.no_grad(): + if self.ref_model is None: + with model.disable_adapter(): + ref_logprobs_model_data = compute_logprobs_for_data(model, model_data) + else: + ref_logprobs_model_data = compute_logprobs_for_data(self.ref_model, model_data) + + # Mask padding tokens + model_padding_mask = model_data["attention_mask"][:, context_length:] == 0 + model_logprobs_model_data = model_logprobs_model_data.masked_fill(model_padding_mask, 0.0) + ref_logprobs_model_data = ref_logprobs_model_data.masked_fill(model_padding_mask, 0.0) + + return (model_logprobs_model_data, ref_logprobs_model_data) + + def _compute_losses( + self, + model_logprobs_model_data, + ref_logprobs_model_data, + probability, + ): + # reinforce score where 0.5 is a control variate + score = (probability - 0.5) * model_logprobs_model_data.sum(1) + + # kl divergence via reinforce + with torch.no_grad(): + log_ratio = model_logprobs_model_data - ref_logprobs_model_data + kl_div_log = log_ratio.sum(1) + kl_div_loss = (log_ratio * model_logprobs_model_data).sum(1) + + # final loss + loss = self.beta * kl_div_loss - score + + return loss.mean(), score, kl_div_log + + def _log_statistics( + self, + model_data, + mixture_data, + model_logprobs_model_data, + ref_logprobs_model_data, + probability, + score, + kl_div, + context_length, + model_scores=None, + mixture_scores=None, + ): + # Helper function to gather and compute mean + def gather_mean(tensor): + return self.accelerator.gather_for_metrics(tensor).mean().item() + + # Log score + self.stats["loss/score"].append(gather_mean(score)) + # Log KL divergence + self.stats["loss/kl"].append(gather_mean(kl_div)) + + # Log logprobs + model_logprobs_model_data_sum = model_logprobs_model_data.sum(1) + ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1) + + self.stats["logps/chosen"].append(gather_mean(model_logprobs_model_data_sum)) + self.stats["logps/rejected"].append(gather_mean(ref_logprobs_model_data_sum)) + + # Log rewards + if 
self.reward_funcs is not None: + self.stats["rewards/chosen"].append(gather_mean(model_scores)) + self.stats["rewards/rejected"].append(gather_mean(mixture_scores)) + + # Log probabilities + self.stats["rewards/probabilities"].append(gather_mean(probability)) + + # Calculate entropy for model data + entropy_model_data = -model_logprobs_model_data.sum(1) + self.stats["objective/entropy"].append(gather_mean(entropy_model_data)) + + # Calculate margins + margin = model_logprobs_model_data_sum - ref_logprobs_model_data_sum + self.stats["rewards/margins"].append(gather_mean(margin)) + + # Calculate accuracy + accuracy = (margin > 0).float() + self.stats["rewards/accuracies"].append(gather_mean(accuracy)) + + # Log EOS token statistics + model_eos = (model_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1) + mixture_eos = (mixture_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1) + self.stats["val/model_contain_eos_token"].append(gather_mean(model_eos.float())) + self.stats["val/ref_contain_eos_token"].append(gather_mean(mixture_eos.float())) + + # Log beta and mixture coef + self.stats["beta"].append(self.beta) + self.stats["mixture_coef"].append(self.mixture_coef) + + def training_step( + self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None + ) -> torch.Tensor: + model.train() + + # Apply chat template and tokenize the input + batch_size = len(next(iter(inputs.values()))) + prompts = inputs["prompt"] + inputs = [{k: v[i] for k, v in inputs.items()} for i in range(batch_size)] + inputs = [maybe_apply_chat_template(x, self.processing_class) for x in inputs] + inputs = [self.tokenize_row(x, self.model.config.is_encoder_decoder, self.processing_class) for x in inputs] + inputs = self.data_collator(inputs) + + # need the prompt_ only + inputs = self._prepare_inputs(inputs) + context_length = inputs["prompt_input_ids"].shape[1] + prompts = { + "input_ids": inputs["prompt_input_ids"], + "attention_mask": inputs["prompt_attention_mask"], + "raw": prompts, + } + del inputs + + # Sample completions from both the model and the reference model + model_output, mixture_output = self._generate_completions(model, prompts) + + # Process model completions + model_data, mixture_data = self._process_completions(model_output, mixture_output, prompts) + + # Compute rewards + if self.reward_funcs is not None: + model_scores, mixture_scores = self._compute_rewards(model_data, mixture_data, context_length) + # probability of the model data vs the mixture data + probability = F.sigmoid(model_scores - mixture_scores) + else: + model_scores, mixture_scores = None, None + probability = self._compute_judge(model_data, mixture_data, context_length) + + # Compute logprobs + model_logprobs_model_data, ref_logprobs_model_data = self._compute_logprobs(model, model_data, context_length) + + # Compute loss + loss, score, kl_div = self._compute_losses(model_logprobs_model_data, ref_logprobs_model_data, probability) + + # Log everything + self._log_statistics( + model_data, + mixture_data, + model_logprobs_model_data.detach(), + ref_logprobs_model_data, + probability, + score.detach(), + kl_div.detach(), + context_length, + model_scores, + mixture_scores, + ) + + if ( + self.args.torch_empty_cache_steps is not None + and self.state.global_step % self.args.torch_empty_cache_steps == 0 + ): + empty_cache() + + kwargs = {} + # For LOMO optimizers you need to explicitly use the learning rate + if 
self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: + kwargs["learning_rate"] = self._get_learning_rate() + + if self.args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + + if self.use_apex: + with amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + self.accelerator.backward(loss, **kwargs) + + return loss.detach() / self.args.gradient_accumulation_steps + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + # docstyle-ignore + citation = textwrap.dedent("""\ + @inproceedings{munos2024nash, + title = {{Nash Learning from Human Feedback}}, + author = {R{\'{e}}mi Munos and Michal Valko and Daniele Calandriello and Mohammad Gheshlaghi Azar and Mark Rowland and Zhaohan Daniel Guo and Yunhao Tang and Matthieu Geist and Thomas Mesnard and C{\\^{o}}me Fiegel and Andrea Michi and Marco Selvi and Sertan Girgin and Nikola Momchev and Olivier Bachem and Daniel J. Mankowitz and Doina Precup and Bilal Piot}, + year = 2024, + booktitle = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024}, + publisher = {OpenReview.net}, + url = {https://openreview.net/forum?id=Y5AmNYiyCQ} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="Nash-MD", + trainer_citation=citation, + paper_title="Nash Learning from Human Feedback", + paper_id="2312.00886", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothNashMDTrainer(_UnslothNashMDTrainer): + """ + +Initialize NashMDTrainer as a subclass of [`OnlineDPOConfig`]. + +Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForCausalLM`. + ref_model (`PreTrainedModelWrapper`): + Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation + and loss. If no reference model is provided, the trainer will create a reference model with the same + architecture as the model to be optimized. + reward_funcs (`transformers.PreTrainedModel`): + The reward model to score completions with, preferably an `AutoModelForSequenceClassification`. 
+ judge (`BasePairwiseJudge`): + The judge to use for pairwise comparison of model completions. + args (`NashMDConfig`): + The NashMD config arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + peft_config (`dict`): + The peft config to use for training. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + +.. deprecated:: 0.22.0 + The following parameters are deprecated and will be removed in a future version: + + * `reward_model`: Use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`. + + """ + def __init__( + self, + model = None, + ref_model = None, + reward_funcs = None, + judge = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + peft_config = None, + compute_metrics = None, + callbacks = None, + preprocess_logits_for_metrics = None, + reward_model = None, + **kwargs + ): + if args is None: args = UnslothNashMDConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if 
isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('nash_md_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? [TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + model = model, + ref_model = ref_model, + reward_funcs = reward_funcs, + judge = judge, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + peft_config = peft_config, + compute_metrics = compute_metrics, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + reward_model = reward_model,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass diff --git a/unsloth_compiled_cache/UnslothORPOTrainer.py b/unsloth_compiled_cache/UnslothORPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..c5f59af66c69e1fb3c1ddba9e897e5bd4ac0b7a5 --- /dev/null +++ b/unsloth_compiled_cache/UnslothORPOTrainer.py @@ -0,0 +1,1791 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . + +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.orpo_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, Literal, ORPOConfig, ORPOTrainer, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, Trainer, TrainerCallback, Union, add_bos_token_if_needed, add_eos_token_if_needed, autocast, defaultdict, disable_dropout_in_model, generate_model_card, get_comet_experiment_url, inspect, is_comet_available, is_peft_available, is_torch_fx_proxy, is_torch_xla_available, is_wandb_available, log_table_to_comet_experiment, logger, logging, maybe_apply_chat_template, maybe_extract_prompt, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_model_for_kbit_training, random, selective_log_softmax, textwrap, torch, F, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, logger, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) + 
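+ # Per-token log-probability via the identity log p(i) = logits[i] - logsumexp(logits):
+ # the gather above selects each target token's logit, and the float32 upcast keeps the
+ # logsumexp below numerically stable while only one 4x-smaller chunk is live at a time.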
logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values + all_per_token_logps.append(per_token_logps) + pass + all_per_token_logps = torch.concat(all_per_token_logps) + all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) + return all_per_token_logps + +def calculate_pad_tokens_in_prompt( + input_ids: torch.Tensor, + logits_to_keep: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens + """ + if logits_to_keep >= input_ids.shape[1]: + raise ValueError("logits_to_keep must be smaller than the sequence length.") + + prompt_section = input_ids[:, :-logits_to_keep] + + padding_mask = (prompt_section == pad_token_id) + + pad_token_counts = padding_mask.sum(dim=1) + + return pad_token_counts + +def create_completion_attention_mask( + completion_input_ids: torch.Tensor, + left_pad_tokens_per_prompt: torch.Tensor, + max_left_pad: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad] + + Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens + and pad are pad tokens, this function would make a completion mask that would 0 out the pad + and p tokens. so in this example [0,0,0,1,1,1,0,0,0] + """ + batch_size, completion_len = completion_input_ids.shape + device = completion_input_ids.device + + num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt + + indices = torch.arange(completion_len, device=device).unsqueeze(0) + shift_mask = indices >= num_tokens_to_mask.unsqueeze(1) + + non_padding_mask = (completion_input_ids != pad_token_id) + + final_mask = shift_mask & non_padding_mask + + return final_mask + +def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor: + """ + Moves all padding tokens in each sequence of a batch to the right. + """ + mask = (tensor != pad_id) + # Must do stable=True since binary mark is unordered + sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True) + packed_tensor = torch.gather(tensor, 1, sorted_indices) + return packed_tensor + +def align_logprobs_with_mask( + logprob_tensor: torch.Tensor, + attention_mask: torch.Tensor, + pad_value: float = 0.0 +) -> torch.Tensor: + """ + Aligns a log probability tensor with a given attention mask. + """ + + device = logprob_tensor.device + batch_size, logprob_seq_len = logprob_tensor.shape + mask_seq_len = attention_mask.shape[1] + + padded_logprobs = torch.full( + attention_mask.shape, + fill_value=pad_value, + dtype=logprob_tensor.dtype, + device=device + ) + + left_pad_counts = torch.argmax(attention_mask, dim=1) + + cols = torch.arange(logprob_seq_len, device=device) + dest_indices = left_pad_counts.unsqueeze(1) + cols + + # Create destination row indices + # Shape: [batch_size, logprob_seq_len] + row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices) + + # --- 4. Filter out-of-bounds indices and perform assignment --- + # Create a mask to identify only the indices that are within the bounds + # of the target tensor's sequence length. + valid_mask = dest_indices < mask_seq_len + + # Use this mask to select only the valid row indices, column indices, + # and the corresponding values from the logprob tensor. + # This flattens the selected elements into 1D tensors. 
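+ # Worked illustration of the indexing above (hypothetical shapes): with
+ # left_pad_counts = [2] and logprob_seq_len = 3, dest_indices for that row is
+ # [2, 3, 4]; destination columns >= mask_seq_len are dropped by valid_mask, and the
+ # surviving (row, col, value) triples are written in one vectorized assignment below.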
+ valid_rows = row_indices[valid_mask] + valid_cols = dest_indices[valid_mask] + valid_vals = logprob_tensor[valid_mask] + + # Place the valid values into their correct positions in the padded tensor + # using a single, efficient advanced indexing operation. + padded_logprobs[valid_rows, valid_cols] = valid_vals + + return padded_logprobs +@dataclass +class UnslothORPOConfig(ORPOConfig): + """ + +Configuration class for the [`ORPOTrainer`]. + +This class includes only the parameters that are specific to ORPO training. For a full list of training arguments, +please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may +differ from those in [`~transformers.TrainingArguments`]. + +Using [`~transformers.HfArgumentParser`] we can turn this class into +[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the +command line. + +Parameters: + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`int` or `None`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the relative ratio loss weight in the ORPO loss. In the + [paper](https://huggingface.co/papers/2403.07691), it is denoted by λ. In the + [code](https://github.com/xfactlab/orpo), it is denoted by `alpha`. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + label_pad_token_id (`int`, *optional*, defaults to `-100`): + Label pad token id. This argument is required if you want to use the default data collator. + padding_value (`int` or `None`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + truncation_mode (`str`, *optional*, defaults to `"keep_end"`): + Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. + This argument is required if you want to use the default data collator. + generate_during_eval (`bool`, *optional*, defaults to `False`): + If `True`, generates and logs completions from the model to W&B or Comet during evaluation. + is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`): + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + max_seq_length : Optional[int] = field( + default = None, + metadata = {'help': 'Maximum sequence length to truncate to.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = None, + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + parallelism_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + project = 'huggingface', + trackio_space_id = 'trackio', + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = True, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + 
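+ # NOTE: several defaults above (e.g. seed = 3407, optim = 'adamw_8bit',
+ # gradient_checkpointing = True) intentionally differ from stock
+ # transformers.TrainingArguments, as flagged in the class docstring.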
batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = True, + max_length = 1024, + max_prompt_length = 512, + max_completion_length = None, + beta = 0.1, + disable_dropout = True, + label_pad_token_id = -100, + padding_value = None, + truncation_mode = 'keep_end', + generate_during_eval = False, + is_encoder_decoder = None, + model_init_kwargs = None, + dataset_num_proc = None, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + max_seq_length = None, + **kwargs, + ): + if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small (less than 1e-7)! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too large (> 1)! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = min(max(cpu_count()+4, 2), 64) + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, +
past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + max_length = max_length, + max_prompt_length = max_prompt_length, + max_completion_length = max_completion_length, + beta = beta, + disable_dropout = disable_dropout, + label_pad_token_id = label_pad_token_id, + padding_value = padding_value, + truncation_mode = truncation_mode, + generate_during_eval = generate_during_eval, + is_encoder_decoder = is_encoder_decoder, + model_init_kwargs = model_init_kwargs, + dataset_num_proc = dataset_num_proc,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks + self.max_seq_length = max_seq_length +pass + +class _UnslothORPOTrainer(Trainer): + r""" + Initialize ORPOTrainer. 
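+
+ ORPO (Odds Ratio Preference Optimization, https://huggingface.co/papers/2403.07691) trains on
+ preference pairs without a separate reference model: the loss is the NLL of the chosen
+ prompt+response minus a beta-weighted mean log-sigmoid of the log-odds ratio of chosen over
+ rejected, as computed in `odds_ratio_loss` and assembled in `get_batch_loss_metrics` below.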
+ + Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForSequenceClassification`. + args (`ORPOConfig`): + The ORPO config arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + peft_config (`dict`, defaults to `None`): + The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in + a PEFT model. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. + """ + + _tag_names = ["trl", "orpo"] + + def __init__( + self, + model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, + args: Optional[ORPOConfig] = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional[dict] = None, + compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None, + ): + if args.model_init_kwargs is None: + model_init_kwargs = {} + elif not isinstance(model, str): + raise ValueError("You passed model_kwargs to the ORPOTrainer. But your model is already instantiated.") + else: + model_init_kwargs = args.model_init_kwargs + dtype = model_init_kwargs.get("dtype") + if dtype is not None: + # Convert to `torch.dtype` if an str is passed + if isinstance(dtype, str) and dtype != "auto": + dtype = getattr(torch, dtype) + if dtype != "auto" and not isinstance(dtype, torch.dtype): + raise ValueError( + f"Invalid `dtype` passed to the ORPOConfig. 
Expected a string with either `torch.dtype` or 'auto', but got {dtype}." + ) + model_init_kwargs["dtype"] = dtype + + if isinstance(model, str): + model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) + + # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16` + # has been called in order to properly call autocast if needed. + self._peft_has_been_casted_to_bf16 = False + + if not is_peft_available() and peft_config is not None: + raise ValueError( + "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models" + ) + elif is_peft_available() and peft_config is not None: + # if model is a peft model and we have a peft_config, we merge and unload it first + if isinstance(model, PeftModel): + model = model.merge_and_unload() + + if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False): + _support_gc_kwargs = hasattr( + args, "gradient_checkpointing_kwargs" + ) and "gradient_checkpointing_kwargs" in list( + inspect.signature(prepare_model_for_kbit_training).parameters + ) + + prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing} + + if _support_gc_kwargs: + prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs + + model = prepare_model_for_kbit_training(model, **prepare_model_kwargs) + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + # get peft model with the given config + model = model + if args.bf16 and getattr(model, "is_loaded_in_4bit", False): + peft_module_casting_to_bf16(model) + # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager + self._peft_has_been_casted_to_bf16 = True + + # For models that use gradient_checkpointing, we need to attach a hook that enables input + # to explicitly have `requires_grad=True`, otherwise training will either silently + # fail or completely fail. + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + if args.generate_during_eval and not (is_wandb_available() or is_comet_available()): + raise ValueError( + "`generate_during_eval=True` requires Weights and Biases or Comet to be installed." + " Please install `wandb` or `comet-ml` to resolve." 
+ ) + + if model is not None: + self.is_encoder_decoder = model.config.is_encoder_decoder + elif args.is_encoder_decoder is None: + raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.") + else: + self.is_encoder_decoder = args.is_encoder_decoder + + if self.is_encoder_decoder: + self.decoder_start_token_id = model.config.decoder_start_token_id + self.pad_token_id = model.config.pad_token_id + + if processing_class is None: + raise ValueError("processing_class must be specified to tokenize a ORPO dataset.") + if args.max_length is None: + logger.warning( + "`max_length` is not set in the ORPOConfig's init" + " it will default to `512` by default, but you should do it yourself in the future.", + ) + max_length = 512 + else: + max_length = args.max_length + if args.max_prompt_length is None: + logger.warning( + "`max_prompt_length` is not set in the ORPOConfig's init" + " it will default to `128` by default, but you should do it yourself in the future.", + ) + max_prompt_length = 128 + else: + max_prompt_length = args.max_prompt_length + + if args.max_completion_length is None and self.is_encoder_decoder: + logger.warning( + "When using an encoder decoder architecture, you should set `max_completion_length` in the ORPOConfig's init" + " it will default to `128` by default, but you should do it yourself in the future.", + ) + self.max_completion_length = 128 + else: + self.max_completion_length = args.max_completion_length + + if data_collator is None: + data_collator = DPODataCollatorWithPadding( + pad_token_id=processing_class.pad_token_id, + label_pad_token_id=args.label_pad_token_id, + is_encoder_decoder=self.is_encoder_decoder, + ) + + if args.remove_unused_columns: + args.remove_unused_columns = False + # warn users + logger.warning( + "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments" + " we have set it for you, but you should do it yourself in the future.", + ) + + self.use_dpo_data_collator = True + else: + self.use_dpo_data_collator = False + + # Disable dropout in the model and reference model + if args.disable_dropout: + disable_dropout_in_model(model) + + self.max_length = max_length + self.generate_during_eval = args.generate_during_eval + self.label_pad_token_id = args.label_pad_token_id + self.padding_value = args.padding_value if args.padding_value is not None else processing_class.pad_token_id + self.max_prompt_length = max_prompt_length + self.truncation_mode = args.truncation_mode + self.processing_class = processing_class + + self.beta = args.beta + self.aux_loss_enabled = getattr(model.config, "output_router_logits", False) + self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0) + if self.aux_loss_enabled and self.aux_loss_coef == 0.0: + logger.warning( + "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to " + "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value " + "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary " + "loss.", + ) + + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in ORPO, the sampled data does not include the + # "input_ids" key. 
Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and + # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens + # of the input, floating-point operations will not be computed." To suppress this warning, we set the + # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate + # that the warning has already been issued. + model.warnings_issued["estimate_tokens"] = True + + # Compute that only on the main process for faster data processing. + # see: https://github.com/huggingface/trl/pull/1255 + with PartialState().main_process_first(): + # Extract the prompt if needed, and apply the chat template if needed + train_dataset = train_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc) + train_dataset = train_dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}, num_proc=args.dataset_num_proc + ) + train_dataset = train_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc) + if eval_dataset is not None: + eval_dataset = eval_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc) + eval_dataset = eval_dataset.map( + maybe_apply_chat_template, + fn_kwargs={"tokenizer": processing_class}, + num_proc=args.dataset_num_proc, + ) + eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc) + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. + self.model_accepts_loss_kwargs = False + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + if not hasattr(self, "accelerator"): + raise AttributeError( + "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`." + ) + + def build_tokenized_answer(self, prompt, answer): + """ + Llama tokenizer does not satisfy `enc(a + b) = enc(a) + enc(b)`. It does ensure `enc(a + b) = enc(a) + enc(a + + b)[len(enc(a)):]`.
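+ For example, a tokenizer may merge the pieces at the prompt/answer boundary, so that slicing
+ `enc(prompt + answer)` at `len(enc(prompt))` would split a merged token incorrectly; this method
+ therefore re-splits the full tokenization at the prompt length and backs off by one token when
+ the boundary token differs.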
Reference: + https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257 + """ + + full_tokenized = self.processing_class(prompt + answer, add_special_tokens=False) + prompt_input_ids = self.processing_class(prompt, add_special_tokens=False)["input_ids"] + + answer_input_ids = full_tokenized["input_ids"][len(prompt_input_ids) :] + answer_attention_mask = full_tokenized["attention_mask"][len(prompt_input_ids) :] + + # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]` + full_concat_input_ids = np.concatenate([prompt_input_ids, answer_input_ids]) + + # Prepare input tokens for token by token comparison + full_input_ids = np.array(full_tokenized["input_ids"]) + + if len(full_input_ids) != len(full_concat_input_ids): + raise ValueError("Prompt input ids and answer input ids should have the same length.") + + # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens + # can be merged together when tokenizing prompt+answer. This could result + # in the last token from the prompt being different when tokenized on its own + # vs when done as prompt+answer. + response_token_ids_start_idx = len(prompt_input_ids) + + # If the tokenized prompt is different from the start of the tokenized prompt+answer, then it means the + # last token has changed due to merging. + if prompt_input_ids != full_tokenized["input_ids"][:response_token_ids_start_idx]: + response_token_ids_start_idx -= 1 + + prompt_input_ids = full_tokenized["input_ids"][:response_token_ids_start_idx] + prompt_attention_mask = full_tokenized["attention_mask"][:response_token_ids_start_idx] + + if len(prompt_input_ids) != len(prompt_attention_mask): + raise ValueError("Prompt input ids and attention mask should have the same length.") + + answer_input_ids = full_tokenized["input_ids"][response_token_ids_start_idx:] + answer_attention_mask = full_tokenized["attention_mask"][response_token_ids_start_idx:] + + return dict( + prompt_input_ids=prompt_input_ids, + prompt_attention_mask=prompt_attention_mask, + input_ids=answer_input_ids, + attention_mask=answer_attention_mask, + ) + + def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module]] = None) -> dict: + """Tokenize a single row from an ORPO-specific dataset. + + At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt + + chosen or prompt + rejected responses are too long. First we truncate the prompt; if we're still too long, + we truncate the chosen/rejected. + + We also create the labels for the chosen/rejected responses, which are of length equal to the sum of the length + of the prompt and the chosen/rejected response, with label_pad_token_id for the prompt tokens. + """ + batch = {} + prompt = feature["prompt"] + chosen = feature["chosen"] + rejected = feature["rejected"] + + if not self.is_encoder_decoder: + # Check issues below for more details + # 1. https://github.com/huggingface/trl/issues/907 + # 2. https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257 + # 3.
https://github.com/LianjiaTech/BELLE/issues/337 + + if not isinstance(prompt, str): + raise ValueError(f"prompt should be a str but got {type(prompt)}") + prompt_tokens = self.processing_class(prompt, add_special_tokens=False) + prompt_tokens = {f"prompt_{k}": v for k, v in prompt_tokens.items()} + + if not isinstance(chosen, str): + raise ValueError(f"chosen should be a str but got {type(chosen)}") + chosen_tokens = self.build_tokenized_answer(prompt, chosen) + + if not isinstance(rejected, str): + raise ValueError(f"rejected should be a str but got {type(rejected)}") + rejected_tokens = self.build_tokenized_answer(prompt, rejected) + + # Last prompt token might get merged by tokenizer and + # it should not be included for generation if that happens + prompt_len_input_ids = len(prompt_tokens["prompt_input_ids"]) + + chosen_prompt_len_input_ids = len(chosen_tokens["prompt_input_ids"]) + rejected_prompt_len_input_ids = len(rejected_tokens["prompt_input_ids"]) + prompt_len_input_ids = min(chosen_prompt_len_input_ids, rejected_prompt_len_input_ids) + + for k, v in prompt_tokens.items(): + prompt_tokens[k] = v[:prompt_len_input_ids] + + # Make sure the chosen and rejected prompts have at most one different token, + # and that their lengths differ by at most 1 + num_diff_tokens = sum( + [a != b for a, b in zip(chosen_tokens["prompt_input_ids"], rejected_tokens["prompt_input_ids"])] + ) + num_diff_len = abs(chosen_prompt_len_input_ids - rejected_prompt_len_input_ids) + if num_diff_tokens > 1 or num_diff_len > 1: + raise ValueError( + "Chosen and rejected prompt_input_ids might only differ on the " + "last token due to tokenizer merge ops." + ) + + # add BOS token to head of prompt. Avoid adding if it's already there + prompt_tokens, chosen_tokens, rejected_tokens = add_bos_token_if_needed( + self.processing_class.bos_token_id, + prompt_len_input_ids, + prompt_tokens, + chosen_prompt_len_input_ids, + chosen_tokens, + rejected_prompt_len_input_ids, + rejected_tokens, + ) + + # add EOS token to end of answer.
Avoid adding if it's already there + chosen_tokens, rejected_tokens = add_eos_token_if_needed( + self.processing_class.eos_token_id, chosen_tokens, rejected_tokens + ) + + longer_response_length = max(len(chosen_tokens["input_ids"]), len(rejected_tokens["input_ids"])) + + # if combined sequence is too long, truncate the prompt + for answer_tokens in [chosen_tokens, rejected_tokens, prompt_tokens]: + if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length: + if self.truncation_mode == "keep_start": + for k in ["prompt_input_ids", "prompt_attention_mask"]: + answer_tokens[k] = answer_tokens[k][: self.max_prompt_length] + elif self.truncation_mode == "keep_end": + for k in ["prompt_input_ids", "prompt_attention_mask"]: + answer_tokens[k] = answer_tokens[k][-self.max_prompt_length :] + else: + raise ValueError(f"Unknown truncation mode: {self.truncation_mode}") + + # if that's still too long, truncate the response + for answer_tokens in [chosen_tokens, rejected_tokens]: + if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length: + for k in ["input_ids", "attention_mask"]: + answer_tokens[k] = answer_tokens[k][: self.max_length - self.max_prompt_length] + + # Create labels + chosen_sequence_tokens = { + k: chosen_tokens[f"prompt_{k}"] + chosen_tokens[k] for k in ["input_ids", "attention_mask"] + } + rejected_sequence_tokens = { + k: rejected_tokens[f"prompt_{k}"] + rejected_tokens[k] for k in ["input_ids", "attention_mask"] + } + chosen_sequence_tokens["labels"] = chosen_sequence_tokens["input_ids"][:] + chosen_sequence_tokens["labels"][: len(chosen_tokens["prompt_input_ids"])] = [ + self.label_pad_token_id + ] * len(chosen_tokens["prompt_input_ids"]) + rejected_sequence_tokens["labels"] = rejected_sequence_tokens["input_ids"][:] + rejected_sequence_tokens["labels"][: len(rejected_tokens["prompt_input_ids"])] = [ + self.label_pad_token_id + ] * len(rejected_tokens["prompt_input_ids"]) + + for k, toks in { + "chosen_": chosen_sequence_tokens, + "rejected_": rejected_sequence_tokens, + "": prompt_tokens, + }.items(): + for type_key, tokens in toks.items(): + if type_key == "token_type_ids": + continue + batch[f"{k}{type_key}"] = tokens + + else: + chosen_tokens = self.processing_class( + chosen, truncation=True, max_length=self.max_completion_length, add_special_tokens=True + ) + rejected_tokens = self.processing_class( + rejected, truncation=True, max_length=self.max_completion_length, add_special_tokens=True + ) + prompt_tokens = self.processing_class( + prompt, truncation=True, max_length=self.max_prompt_length, add_special_tokens=True + ) + + batch["chosen_labels"] = chosen_tokens["input_ids"] + batch["rejected_labels"] = rejected_tokens["input_ids"] + batch["prompt_input_ids"] = prompt_tokens["input_ids"] + batch["prompt_attention_mask"] = prompt_tokens["attention_mask"] + + if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"): + batch["rejected_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels( + labels=torch.tensor(batch["rejected_labels"]) + ) + batch["chosen_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels( + labels=torch.tensor(batch["chosen_labels"]) + ) + + if is_torch_xla_available(): + # Pad the sequences to global max_length to avoid TorchXLA recompilation + for k in batch: + if "labels" in k or self.is_encoder_decoder: + pad_value = self.label_pad_token_id + elif k.endswith("_input_ids"): + pad_value = self.padding_value + elif k.endswith("_attention_mask"): + 
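+ # padded attention-mask positions are filled with 0 so they stay masked out
+ # after the chosen/rejected halves are concatenated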
pad_value = 0 + batch[k] = batch[k] + [pad_value] * (self.max_length - len(batch[k])) + return batch + + @staticmethod + def concatenated_inputs( + batch: dict[str, Union[list, torch.LongTensor]], + is_encoder_decoder: bool = False, + label_pad_token_id: int = -100, + padding_value: int = 0, + device: Optional[torch.device] = None, + ) -> dict[str, torch.LongTensor]: + """Concatenate the chosen and rejected inputs into a single tensor. + + Args: + batch: + A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors + of shape (batch_size, sequence_length). + is_encoder_decoder: + Whether the model is an encoder-decoder model. + label_pad_token_id: + The label pad token id. + padding_value: + The padding value to use for the concatenated input_ids. + device: + The device for the concatenated inputs. + + Returns: + A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'. + """ + concatenated_batch = {} + + if is_encoder_decoder: + max_length = max(batch["chosen_labels"].shape[1], batch["rejected_labels"].shape[1]) + else: + max_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1]) + + for k in batch: + if k.startswith("chosen") and isinstance(batch[k], torch.Tensor): + if "labels" in k or is_encoder_decoder: + pad_value = label_pad_token_id + elif k.endswith("_input_ids"): + pad_value = padding_value + elif k.endswith("_attention_mask"): + pad_value = 0 + concatenated_key = k.replace("chosen", "concatenated") + concatenated_batch[concatenated_key] = pad_to_length(batch[k], max_length, pad_value=pad_value) + for k in batch: + if k.startswith("rejected") and isinstance(batch[k], torch.Tensor): + if "labels" in k or is_encoder_decoder: + pad_value = label_pad_token_id + elif k.endswith("_input_ids"): + pad_value = padding_value + elif k.endswith("_attention_mask"): + pad_value = 0 + concatenated_key = k.replace("rejected", "concatenated") + concatenated_batch[concatenated_key] = torch.cat( + ( + concatenated_batch[concatenated_key], + pad_to_length(batch[k], max_length, pad_value=pad_value), + ), + dim=0, + ).to(device=device) + + if is_encoder_decoder: + concatenated_batch["concatenated_input_ids"] = batch["prompt_input_ids"].repeat(2, 1).to(device=device) + concatenated_batch["concatenated_attention_mask"] = ( + batch["prompt_attention_mask"].repeat(2, 1).to(device=device) + ) + + return concatenated_batch + + def odds_ratio_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """Compute ORPO's odds ratio (OR) loss for a batch of policy model log probabilities. + + Args: + policy_chosen_logps: + Log probabilities of the policy model for the chosen responses. Shape: (batch_size,) + policy_rejected_logps: + Log probabilities of the policy model for the rejected responses. Shape: (batch_size,) + + Returns: + A tuple of five tensors: (losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen). The + losses tensor contains the ORPO loss for each example in the batch. The chosen_rewards and rejected_rewards + tensors contain the rewards for the chosen and rejected responses, respectively. The last two are scalar + means for logging: the mean `log(sigmoid(log_odds))` and the mean log odds of the chosen over the rejected + responses. + """ + + # Derived from Eqs.
(4) and (7) from https://huggingface.co/papers/2403.07691 by using log identities and exp(log(P(y|x)) = P(y|x) + log_odds = (policy_chosen_logps - policy_rejected_logps) - ( + torch.log1p(-torch.exp(policy_chosen_logps)) - torch.log1p(-torch.exp(policy_rejected_logps)) + ) + ratio = F.logsigmoid(log_odds) + losses = self.beta * ratio + + chosen_rewards = self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach() + rejected_rewards = self.beta * (policy_rejected_logps.to(self.accelerator.device)).detach() + + return losses, chosen_rewards, rejected_rewards, torch.mean(ratio), torch.mean(log_odds) + + @staticmethod + def get_batch_logps( + logits: torch.FloatTensor, + labels: torch.LongTensor, + average_log_prob: bool = False, + label_pad_token_id: int = -100, + is_encoder_decoder: bool = False, + ) -> torch.FloatTensor: + """Compute the log probabilities of the given labels under the given logits. + + Args: + logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size) + labels: + Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are + ignored. Shape: (batch_size, sequence_length) + average_log_prob: + If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the + log probabilities of the (non-masked) tokens. + label_pad_token_id: The label pad token id. + is_encoder_decoder: Whether the model is an encoder-decoder model. + + Returns: + A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the + given logits. + """ + if logits.shape[:-1] != labels.shape: + raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.") + + if not is_encoder_decoder: + labels = labels[:, 1:].clone() + logits = logits[:, :-1, :] + loss_mask = labels != label_pad_token_id + + # dummy token; we'll ignore the losses on these tokens later + labels = torch.where(labels == label_pad_token_id, 0, labels) + + per_token_logps = selective_log_softmax(logits, labels) + + if average_log_prob: + return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) + else: + return (per_token_logps * loss_mask).sum(-1) + + def concatenated_forward( + self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]] + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together. + + We do this to avoid doing two forward passes, because it's faster for FSDP. 
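+
+ Chosen and rejected sequences are padded to a common length and stacked along the batch
+ dimension by `concatenated_inputs`, so the first `len_chosen` rows of every output belong to
+ the chosen completions and the remainder to the rejected ones.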
+ """ + concatenated_batch = self.concatenated_inputs( + batch, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + padding_value=self.padding_value, + device=self.accelerator.device, + ) + len_chosen = batch["chosen_labels"].shape[0] + + model_kwargs = ( + { + "decoder_input_ids": self._shift_right(concatenated_batch["concatenated_labels"]), + } + if self.is_encoder_decoder + else {} + ) + + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + outputs = model( + concatenated_batch["concatenated_input_ids"], + attention_mask=concatenated_batch["concatenated_attention_mask"], + use_cache=False, + **model_kwargs, + ) + all_logits = outputs.logits + + def cross_entropy_loss(logits, labels): + if not self.is_encoder_decoder: + # Shift so that tokens < n predict n + logits = logits[..., :-1, :].contiguous() + labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + logits = logits.view(-1, logits.shape[-1]) + labels = labels.view(-1) + # Enable model parallelism + labels = labels.to(logits.device) + loss = loss_fct(logits, labels) + return loss + + if self.is_encoder_decoder: + labels = concatenated_batch["concatenated_labels"].clone() + else: + labels = concatenated_batch["concatenated_input_ids"].clone() + attention_mask = concatenated_batch["concatenated_attention_mask"] + labels = torch.where(attention_mask == 1, labels, self.label_pad_token_id) + # orpo chosen nll loss is computed over the full prompt and response + chosen_nll_loss = cross_entropy_loss(all_logits[:len_chosen], labels[:len_chosen]) + + all_logps = self.get_batch_logps( + all_logits, + concatenated_batch["concatenated_labels"], + average_log_prob=True, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + + chosen_logps = all_logps[:len_chosen] + rejected_logps = all_logps[len_chosen:] + + if not self.is_encoder_decoder: + chosen_logits = all_logits[:len_chosen, :-1, :] + rejected_logits = all_logits[len_chosen:, :-1, :] + else: + chosen_logits = all_logits[:len_chosen] + rejected_logits = all_logits[len_chosen:] + + if self.aux_loss_enabled: + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_nll_loss, outputs.aux_loss) + + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_nll_loss) + + def get_batch_loss_metrics( + self, + model, + batch: dict[str, Union[list, torch.LongTensor]], + train_eval: Literal["train", "eval"] = "train", + ): + """Compute the ORPO loss and other metrics for the given batch of inputs for train or test.""" + metrics = {} + + forward_output = self.concatenated_forward(model, batch) + ( + policy_chosen_logps, + policy_rejected_logps, + policy_chosen_logits, + policy_rejected_logits, + policy_nll_loss, + ) = forward_output[:5] + if self.aux_loss_enabled: + aux_loss = forward_output[5] + + losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = self.odds_ratio_loss( + policy_chosen_logps, policy_rejected_logps + ) + # full ORPO loss + loss = policy_nll_loss - losses.mean() + + reward_accuracies = (chosen_rewards > rejected_rewards).float() + + prefix = "eval_" if train_eval == "eval" else "" + metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(chosen_rewards).mean() + metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(rejected_rewards).mean() + metrics[f"{prefix}rewards/accuracies"] = 
self.accelerator.gather_for_metrics(reward_accuracies).mean() + metrics[f"{prefix}rewards/margins"] = self.accelerator.gather_for_metrics( + chosen_rewards - rejected_rewards + ).mean() + metrics[f"{prefix}logps/rejected"] = self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean() + metrics[f"{prefix}logps/chosen"] = self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean() + metrics[f"{prefix}logits/rejected"] = self.accelerator.gather_for_metrics( + policy_rejected_logits.detach().mean() + ).mean() + metrics[f"{prefix}logits/chosen"] = self.accelerator.gather_for_metrics( + policy_chosen_logits.detach().mean() + ).mean() + metrics[f"{prefix}nll_loss"] = self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean() + metrics[f"{prefix}log_odds_ratio"] = self.accelerator.gather_for_metrics(log_odds_ratio).detach().mean() + metrics[f"{prefix}log_odds_chosen"] = self.accelerator.gather_for_metrics(log_odds_chosen).detach().mean() + if is_torch_xla_available(): + xm.mark_step() # needed because .item() calls + for k, v in metrics.items(): + metrics[k] = v.item() + if self.aux_loss_enabled: + loss += self.aux_loss_coef * aux_loss + + return loss, metrics + + def compute_loss( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + return_outputs=False, + num_items_in_batch=None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]: + compute_loss_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with compute_loss_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train") + + # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class: + loss = loss.to(self.args.device) + + # force log the metrics + self.store_metrics(metrics, train_eval="train") + + if return_outputs: + return (loss, metrics) + return loss + + def generate_from_model(self, model, batch: dict[str, torch.LongTensor]) -> str: + """Generate samples from the model and reference model for the given batch of inputs.""" + + # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with + # the torch amp context manager as some hidden states are silently casted to full precision. + generate_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with generate_context_manager: + policy_output = model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + + policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id) + policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True) + + return policy_output_decoded + + def prediction_step( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[list[str]] = None, + ): + if not self.use_dpo_data_collator: + logger.warning( + "prediction_step is only implemented for DPODataCollatorWithPadding, and you passed a datacollator that is different than " + "DPODataCollatorWithPadding - you might see unexpected behavior. 
Alternatively, you can implement your own prediction_step method if you are using a custom data collator" + ) + if ignore_keys is None: + if hasattr(model, "config"): + ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + prediction_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with torch.no_grad(), prediction_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="eval") + + # force log the metrics + self.store_metrics(metrics, train_eval="eval") + + if prediction_loss_only: + return (loss.detach(), None, None) + + # logits for the chosen and rejected samples from model + logits_dict = { + "eval_logits/chosen": metrics["eval_logits/chosen"], + "eval_logits/rejected": metrics["eval_logits/rejected"], + } + logits = [v for k, v in logits_dict.items() if k not in ignore_keys] + logits = torch.tensor(logits, device=self.accelerator.device) + labels = torch.zeros(logits.shape[0], device=self.accelerator.device) + + return (loss.detach(), logits, labels) + + def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None: + for key, value in metrics.items(): + self._stored_metrics[train_eval][key].append(value) + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "eval", + ) -> EvalLoopOutput: + """ + Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by + `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. + """ + + # Sample and save to game log if requested (for one batch to save time) + if self.generate_during_eval: + # Generate random indices within the range of the total number of samples + num_samples = len(dataloader.dataset) + random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size) + + # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader + random_batch_dataset = dataloader.dataset.select(random_indices) + random_batch = self.data_collator(random_batch_dataset) + random_batch = self._prepare_inputs(random_batch) + + policy_output_decoded = self.generate_from_model(self.model, random_batch) + + table = pd.DataFrame( + columns=["Prompt", "Policy"], + data=[ + [prompt, pol[len(prompt) :]] for prompt, pol in zip(random_batch["prompt"], policy_output_decoded) + ], + ) + if "wandb" in self.args.report_to: + wandb.log({"game_log": wandb.Table(data=table)}) + + if "comet_ml" in self.args.report_to: + log_table_to_comet_experiment( + name="game_log.csv", + table=table, + ) + + # Base evaluation + initial_output = super().evaluation_loop( + dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix + ) + + return initial_output + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + """ + Log `logs` on the various objects watching training, including stored metrics. + + Args: + logs (`dict[str, float]`): + The values to log. + start_time (`float` or `None`, *optional*, defaults to `None`): + Start time of the training. 
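+
+ The batch metrics buffered via `store_metrics` since the last log are averaged, merged into
+ `logs`, and then cleared before delegating to the parent `log`.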
+ """ + # logs either has 'loss' or 'eval_loss' + train_eval = "train" if "loss" in logs else "eval" + # Add averaged stored metrics to logs + for key, metrics in self._stored_metrics[train_eval].items(): + logs[key] = torch.tensor(metrics).mean().item() + del self._stored_metrics[train_eval] + return super().log(logs, start_time) + + def _shift_right(self, input_ids): + if self.decoder_start_token_id is None: + raise ValueError( + "model.config.decoder_start_token_id has to be defined. It is usually set to the pad_token_id." + ) + + # shift inputs to the right + if is_torch_fx_proxy(input_ids): + # Item assignment is not supported natively for proxies. + shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), self.decoder_start_token_id) + shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) + else: + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = self.decoder_start_token_id + + if self.pad_token_id is None: + raise ValueError("model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, self.pad_token_id) + + return shifted_input_ids + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + # docstyle-ignore + citation = textwrap.dedent("""\ + @article{hong2024orpo, + title = {{ORPO: Monolithic Preference Optimization without Reference Model}}, + author = {Jiwoo Hong and Noah Lee and James Thorne}, + year = 2024, + eprint = {arXiv:2403.07691} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="ORPO", + trainer_citation=citation, + paper_title="ORPO: Monolithic Preference Optimization without Reference Model", + paper_id="2403.07691", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothORPOTrainer(_UnslothORPOTrainer): + """ + +Initialize ORPOTrainer. + +Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForSequenceClassification`. + args (`ORPOConfig`): + The ORPO config arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + peft_config (`dict`, defaults to `None`): + The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in + a PEFT model. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. 
+ + """ + def __init__( + self, + model = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + model_init = None, + callbacks = None, + preprocess_logits_for_metrics = None, + peft_config = None, + compute_metrics = None, + **kwargs + ): + if args is None: args = UnslothORPOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval 
= True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('orpo_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? 
[TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + model = model, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + model_init = model_init, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config, + compute_metrics = compute_metrics,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass + + +if hasattr(logger, "addFilter"): + import logging + class HideLoggingMessage(logging.Filter): + def __init__(self, text): self.text = text + def filter(self, x): return not (self.text in x.getMessage()) + pass + logger.addFilter(HideLoggingMessage("`use_cache=True`")) + diff --git a/unsloth_compiled_cache/UnslothOnlineDPOTrainer.py b/unsloth_compiled_cache/UnslothOnlineDPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..a2306bf2979e6a94198c72cc7c936d58238cc0e8 --- /dev/null +++ b/unsloth_compiled_cache/UnslothOnlineDPOTrainer.py @@ -0,0 +1,2403 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . 
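+
+# Illustrative usage sketch for this auto-generated module (all names below are
+# placeholders; assumes `model`, `tokenizer`, `judge` and `dataset` were already
+# prepared, e.g. with unsloth's loaders):
+#
+#     from unsloth_compiled_cache.UnslothOnlineDPOTrainer import (
+#         UnslothOnlineDPOConfig, UnslothOnlineDPOTrainer,
+#     )
+#     args = UnslothOnlineDPOConfig(output_dir = "online-dpo-outputs")
+#     trainer = UnslothOnlineDPOTrainer(model = model, judge = judge, args = args,
+#         train_dataset = dataset, processing_class = tokenizer)
+#     trainer.train()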
+ +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.online_dpo_trainer import (Any, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, BasePairwiseJudge, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalPrediction, F, FSDP, GenerationConfig, IterableDataset, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, OnlineDPOConfig, OnlineDPOTrainer, OptimizerNames, Optional, Path, PeftConfig, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RewardFunc, SIMPLE_CHAT_TEMPLATE, Trainer, TrainerCallback, Union, VLLMClient, apply_chat_template, broadcast_object_list, create_reference_model, disable_dropout_in_model, empty_cache, gather_object, generate_model_card, get_comet_experiment_url, is_conversational, is_flash_attn_2_available, is_peft_model, is_vllm_available, is_wandb_available, jinja2, logger, logging, maybe_apply_chat_template, nn, nullcontext, os, pad, prepare_deepspeed, prepare_peft_model, profiling_context, re, seed_worker, textwrap, torch, truncate_right, unwrap_model_for_generation, version, warnings, wraps, F, apply_chat_template, is_conversational, re, F, FSDP, is_peft_model, nn, nullcontext, os, re, version, F, Optional, PreTrainedModel, Trainer, logger, os, re, torch, F, FSDP, nn, os, re, F, FSDP, nn, re, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) + logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values + all_per_token_logps.append(per_token_logps) + pass + all_per_token_logps = torch.concat(all_per_token_logps) + all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) + return all_per_token_logps + +def 
calculate_pad_tokens_in_prompt(
+    input_ids: torch.Tensor,
+    logits_to_keep: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a prompt tensor, returns the number of left-padding tokens in each
+    sequence, e.g. [pad, pad, pad, cat] -> 3 tokens.
+    """
+    if logits_to_keep >= input_ids.shape[1]:
+        raise ValueError("logits_to_keep must be smaller than the sequence length.")
+
+    prompt_section = input_ids[:, :-logits_to_keep]
+
+    padding_mask = (prompt_section == pad_token_id)
+
+    pad_token_counts = padding_mask.sum(dim=1)
+
+    return pad_token_counts
+
+def create_completion_attention_mask(
+    completion_input_ids: torch.Tensor,
+    left_pad_tokens_per_prompt: torch.Tensor,
+    max_left_pad: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a sequence [p,p,p,c,c,c,pad,pad,pad], where p are extra prompt tokens
+    obtained from slicing the tensor, c are completion tokens and pad are pad
+    tokens, this function builds a completion mask that zeroes out the pad and
+    p tokens, i.e. [0,0,0,1,1,1,0,0,0] for this example.
+    """
+    batch_size, completion_len = completion_input_ids.shape
+    device = completion_input_ids.device
+
+    num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
+
+    indices = torch.arange(completion_len, device=device).unsqueeze(0)
+    shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
+
+    non_padding_mask = (completion_input_ids != pad_token_id)
+
+    final_mask = shift_mask & non_padding_mask
+
+    return final_mask
+
+def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
+    """
+    Packs the non-padding tokens of each sequence to the left, moving all
+    padding tokens in the batch to the right.
+    """
+    mask = (tensor != pad_id)
+    # Must use stable=True since the binary mask is unordered
+    sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
+    packed_tensor = torch.gather(tensor, 1, sorted_indices)
+    return packed_tensor
+
+def align_logprobs_with_mask(
+    logprob_tensor: torch.Tensor,
+    attention_mask: torch.Tensor,
+    pad_value: float = 0.0
+) -> torch.Tensor:
+    """
+    Aligns a log probability tensor with a given attention mask.
+    """
+
+    device = logprob_tensor.device
+    batch_size, logprob_seq_len = logprob_tensor.shape
+    mask_seq_len = attention_mask.shape[1]
+
+    padded_logprobs = torch.full(
+        attention_mask.shape,
+        fill_value=pad_value,
+        dtype=logprob_tensor.dtype,
+        device=device
+    )
+
+    left_pad_counts = torch.argmax(attention_mask, dim=1)
+
+    cols = torch.arange(logprob_seq_len, device=device)
+    dest_indices = left_pad_counts.unsqueeze(1) + cols
+
+    # Create destination row indices
+    # Shape: [batch_size, logprob_seq_len]
+    row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
+
+    # Filter out-of-bounds indices and perform the assignment.
+    # Create a mask to identify only the indices that are within the bounds
+    # of the target tensor's sequence length.
+    valid_mask = dest_indices < mask_seq_len
+
+    # Use this mask to select only the valid row indices, column indices,
+    # and the corresponding values from the logprob tensor.
+    # This flattens the selected elements into 1D tensors.
+    valid_rows = row_indices[valid_mask]
+    valid_cols = dest_indices[valid_mask]
+    valid_vals = logprob_tensor[valid_mask]
+
+    # Place the valid values into their correct positions in the padded tensor
+    # using a single, efficient advanced indexing operation.
+    padded_logprobs[valid_rows, valid_cols] = valid_vals
+
+    return padded_logprobs
+
+def vLLMSamplingParams(**kwargs):
+    from vllm import SamplingParams
+    sampling_params = SamplingParams(**kwargs)
+    sampling_params._set_kwargs = kwargs
+    return sampling_params
+
+@dataclass
+class UnslothOnlineDPOConfig(OnlineDPOConfig):
+    """
+
+Configuration class for the [`OnlineDPOTrainer`].
+
+This class includes only the parameters that are specific to Online DPO training. For a full list of training
+arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this
+class may differ from those in [`~transformers.TrainingArguments`].
+
+Using [`~transformers.HfArgumentParser`] we can turn this class into
+[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+command line.
+
+Parameters:
+    reward_model_path (`str` or `None`, *optional*, defaults to `None`):
+        Path to the reward model. Either `judge` or `reward_model_path` must be set, but not both.
+    judge (`str` or `None`, *optional*, defaults to `None`):
+        Name of the judge to use. Either `judge` or `reward_model_path` must be set, but not both.
+    max_new_tokens (`int`, *optional*, defaults to `64`):
+        Maximum number of tokens to generate per completion.
+    max_length (`int`, *optional*, defaults to `512`):
+        Maximum total length of the sequence (prompt + completion) used to compute log probabilities. If the
+        sequence exceeds this limit, the leftmost tokens will be truncated to preserve as much of the completion as
+        possible.
+    temperature (`float`, *optional*, defaults to `0.9`):
+        Temperature for sampling. The higher the temperature, the more random the completions.
+    missing_eos_penalty (`float` or `None`, *optional*, defaults to `None`):
+        Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage
+        the model to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be a
+        positive value. This parameter only works when using `reward_funcs` and not when using `judge`.
+    beta (`float` or `list[float]`, *optional*, defaults to `0.1`):
+        Parameter controlling the deviation from the reference model. Higher β means less deviation from the
+        reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in
+        the [paper](https://huggingface.co/papers/2310.12036). If a list of floats is provided, the β for each
+        epoch is taken from the list in order, and the last β is used for all remaining epochs.
+    loss_type (`str`, *optional*, defaults to `"sigmoid"`):
+        Type of loss to use. Possible values are:
+
+        - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
+        - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.
+
+    dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
+        Number of processes to use for processing the dataset.
+    disable_dropout (`bool`, *optional*, defaults to `True`):
+        Whether to disable dropout in the model and reference model.
+
+    > Parameters that control generation
+
+    top_p (`float`, *optional*, defaults to `1.0`):
+        Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to
+        `1.0` to consider all tokens.
+    top_k (`int` or `None`, *optional*, defaults to `None`):
+        Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is
+        disabled and all tokens are considered.
+    min_p (`float` or `None`, *optional*, defaults to `None`):
+        Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
+        value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range.
+    repetition_penalty (`float`, *optional*, defaults to `1.0`):
+        Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far.
+        Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat
+        tokens.
+    use_transformers_paged (`bool`, *optional*, defaults to `False`):
+        Whether to use the `transformers` paged implementation for generation. If set to `True`, the `transformers`
+        paged implementation will be used for generation instead of the default padded implementation. This
+        parameter is only effective when `use_vllm` is set to `False`.
+    cache_implementation (`str` or `None`, *optional*, defaults to `None`):
+        Implementation of the cache method for faster generation when `use_vllm` is set to `False`.
+    generation_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
+        Additional keyword arguments to pass to `GenerationConfig` (if using transformers) or `SamplingParams` (if
+        using vLLM) when sampling completions. This can be used to further customize the generation behavior, such
+        as setting `suppress_tokens`, `num_beams`, etc. If it contains keys that conflict with the other generation
+        parameters (like `min_p`, `top_p`, etc.), they will override them.
+
+    > Parameters that control generation acceleration powered by vLLM
+
+    use_vllm (`bool`, *optional*, defaults to `False`):
+        Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for generation
+        instead of the default model.generate(). Requires `vllm` to be installed.
+    vllm_model_impl (`str`, *optional*, defaults to `"vllm"`):
+        Model implementation to use for vLLM. Must be one of `"transformers"` or `"vllm"`. `"transformers"`: Use
+        the `transformers` backend for model implementation. `"vllm"`: Use the `vllm` library for model
+        implementation.
+    vllm_mode (`str`, *optional*, defaults to `"server"`):
+        Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or
+        `"colocate"`.
+
+        - `"server"`: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM
+          server is running (start with `trl vllm-serve`).
+        - `"colocate"`: vLLM will run in the same process and share the training GPUs. This avoids the need for a
+          separate server but may cause resource contention with training.
+    vllm_guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`):
+        Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled.
+
+    > Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`)
+
+    vllm_server_base_url (`str` or `None`, *optional*, defaults to `None`):
+        Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `vllm_server_host` and
+        `vllm_server_port` are ignored.
+    vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`):
+        Host of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
+    vllm_server_port (`int`, *optional*, defaults to `8000`):
+        Port of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
+ vllm_server_timeout (`float`, *optional*, defaults to `240.0`): + Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the + timeout, a `ConnectionError` is raised. + + > Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`) + + vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.55`): + Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set to + `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when + launching the vLLM server via the `--vllm_gpu_memory_utilization` flag. + vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`): + Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set to + `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when + launching the vLLM server via the `--vllm_tensor_parallel_size` flag. + + > Other parameters + + ds3_gather_for_generation (`bool`, *optional*, defaults to `True`): + This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, + improving generation speed. However, disabling this option allows training models that exceed the VRAM + capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible + with vLLM generation. + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + max_seq_length : Optional[int] = field( + default = None, + metadata = {'help': 'Maximum sequence length to truncate to.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = None, + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + parallelism_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + project = 'huggingface', + trackio_space_id = 'trackio', + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = True, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + 
batch_eval_metrics = False,
+        eval_on_start = False,
+        use_liger_kernel = False,
+        liger_kernel_config = None,
+        eval_use_gather_object = False,
+        average_tokens_across_devices = True,
+        reward_model_path = None,
+        judge = None,
+        max_new_tokens = 64,
+        max_length = 512,
+        temperature = 0.9,
+        top_p = 1.0,
+        top_k = None,
+        min_p = None,
+        repetition_penalty = 1.0,
+        generation_kwargs = {},
+        use_transformers_paged = False,
+        cache_implementation = None,
+        missing_eos_penalty = None,
+        loss_type = 'sigmoid',
+        disable_dropout = True,
+        use_vllm = False,
+        vllm_model_impl = 'vllm',
+        vllm_guided_decoding_regex = None,
+        vllm_gpu_memory_utilization = 0.55,
+        vllm_mode = 'colocate',
+        vllm_server_base_url = None,
+        vllm_server_host = '0.0.0.0',
+        vllm_server_port = 8000,
+        vllm_server_timeout = 240.0,
+        vllm_tensor_parallel_size = 1,
+        ds3_gather_for_generation = True,
+        model_init_kwargs = None,
+        reward_weights = None,
+        dataset_num_proc = None,
+        gpu_memory_utilization = None,
+        vllm_sampling_params = None,
+        unsloth_num_chunks = -1,
+        max_seq_length = None,
+        **kwargs,
+    ):
+        if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!')
+        if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too large (> 1)! Consider decreasing it to 1e-1, otherwise gradient updates will explode!')
+        if output_dir is None and save_strategy == 'steps' and save_steps == 500:
+            output_dir = 'unsloth_training_checkpoints'
+            save_strategy = 'no'
+        if dataset_num_proc is None:
+            from multiprocessing import cpu_count
+            dataset_num_proc = min(max(cpu_count()+4, 2), 64)
+        if temperature <= 0:
+            raise ValueError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.')
+        elif temperature >= 10:
+            raise ValueError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.')
+
+
+        super().__init__(
+            output_dir = output_dir,
+            overwrite_output_dir = overwrite_output_dir,
+            do_train = do_train,
+            do_eval = do_eval,
+            do_predict = do_predict,
+            eval_strategy = eval_strategy,
+            prediction_loss_only = prediction_loss_only,
+            per_device_train_batch_size = per_device_train_batch_size,
+            per_device_eval_batch_size = per_device_eval_batch_size,
+            per_gpu_train_batch_size = per_gpu_train_batch_size,
+            per_gpu_eval_batch_size = per_gpu_eval_batch_size,
+            gradient_accumulation_steps = gradient_accumulation_steps,
+            eval_accumulation_steps = eval_accumulation_steps,
+            eval_delay = eval_delay,
+            torch_empty_cache_steps = torch_empty_cache_steps,
+            learning_rate = learning_rate,
+            weight_decay = weight_decay,
+            adam_beta1 = adam_beta1,
+            adam_beta2 = adam_beta2,
+            adam_epsilon = adam_epsilon,
+            max_grad_norm = max_grad_norm,
+            num_train_epochs = num_train_epochs,
+            max_steps = max_steps,
+            lr_scheduler_type = lr_scheduler_type,
+            warmup_ratio = warmup_ratio,
+            warmup_steps = warmup_steps,
+            log_level = log_level,
+            log_level_replica = log_level_replica,
+            log_on_each_node = log_on_each_node,
+            logging_dir = logging_dir,
+            logging_strategy = logging_strategy,
+            logging_first_step = logging_first_step,
+            logging_steps = logging_steps,
+            logging_nan_inf_filter = logging_nan_inf_filter,
+            save_strategy = save_strategy,
+            save_steps = save_steps,
+            save_total_limit = save_total_limit,
+            save_safetensors = save_safetensors,
+            save_on_each_node = save_on_each_node,
+            save_only_model = save_only_model,
+
restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + reward_model_path = 
reward_model_path, + judge = judge, + max_new_tokens = max_new_tokens, + max_length = max_length, + temperature = temperature, + top_p = top_p, + top_k = top_k, + min_p = min_p, + repetition_penalty = repetition_penalty, + generation_kwargs = generation_kwargs, + use_transformers_paged = use_transformers_paged, + cache_implementation = cache_implementation, + missing_eos_penalty = missing_eos_penalty, + loss_type = loss_type, + disable_dropout = disable_dropout, + use_vllm = use_vllm, + vllm_model_impl = vllm_model_impl, + vllm_guided_decoding_regex = vllm_guided_decoding_regex, + vllm_gpu_memory_utilization = vllm_gpu_memory_utilization, + vllm_mode = vllm_mode, + vllm_server_base_url = vllm_server_base_url, + vllm_server_host = vllm_server_host, + vllm_server_port = vllm_server_port, + vllm_server_timeout = vllm_server_timeout, + vllm_tensor_parallel_size = vllm_tensor_parallel_size, + ds3_gather_for_generation = ds3_gather_for_generation, + model_init_kwargs = model_init_kwargs, + reward_weights = reward_weights, + dataset_num_proc = dataset_num_proc, + gpu_memory_utilization = gpu_memory_utilization,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks + self.max_seq_length = max_seq_length +pass + +class _UnslothOnlineDPOTrainer(Trainer): + r""" + Initialize OnlineDPOTrainer. + + Args: + model (`Union[str, nn.Module, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in + `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. + ref_model (`transformers.PreTrainedModel` or `torch.nn.Module` or `None`): + The reference model to use for training. If None is specified, the reference model will be created from the + model. + judge (`BasePairwiseJudge`): + The judge to use for pairwise comparison of model completions. + reward_funcs (`Union[RewardFunc, list[RewardFunc]]`, *optional*, defaults to `None`): + Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward + functions with the prompts and completions and sum the rewards. Can be either: + + - A single reward function: Can be a string (path to model), a [`~transformers.PreTrainedModel`], or a + custom callable function. + - A list of reward functions: Must all be of compatible types. + + Note: Only one of `judge`, or `reward_funcs` should be provided. + args (`OnlineDPOConfig`): + The online DPO config arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]): + The dataset to use for training. + eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`): + The dataset to use for evaluation. 
+ processing_class ([`~transformers.PreTrainedTokenizerBase`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`): + Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either: + + - A single processing class: Used when `reward_funcs` contains only one reward function. + - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`. + + If set to `None`, the tokenizer for each model-based reward function is automatically loaded using + [`~transformers.AutoTokenizer.from_pretrained`]. + peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`): + PEFT configuration used to wrap the model. If `None`, the model is not wrapped. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + + .. deprecated:: 0.22.0 + The following parameters are deprecated and will be removed in a future version: + + * `reward_model`: Use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`. + * `reward_processing_class`: Use `reward_processing_classes` instead. For example, change + `reward_processing_class=tokenizer` to `reward_processing_classes=tokenizer`. 
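+
+    Example (a minimal sketch; every name below is a placeholder, and exactly
+    one of `judge` / `reward_funcs` must be provided):
+
+    ```python
+    trainer = OnlineDPOTrainer(
+        model = "Qwen/Qwen2-0.5B-Instruct",       # or an already-loaded causal LM
+        reward_funcs = "my-org/my-reward-model",  # or judge=some_pairwise_judge
+        args = OnlineDPOConfig(output_dir = "online-dpo-outputs"),
+        processing_class = tokenizer,
+        train_dataset = train_dataset,
+    )
+    trainer.train()
+    ```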
+ """ + + _tag_names = ["trl", "online-dpo"] + + def __init__( + self, + model: Union[PreTrainedModel, nn.Module, str], + ref_model: Union[PreTrainedModel, nn.Module, None] = None, + reward_funcs: Optional[Union[RewardFunc, list[RewardFunc]]] = None, + judge: Optional[BasePairwiseJudge] = None, + args: Optional[OnlineDPOConfig] = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Union[Dataset, IterableDataset]] = None, + eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None, + processing_class: Optional[Union[PreTrainedTokenizerBase, ProcessorMixin]] = None, + reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None, + peft_config: Optional["PeftConfig"] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + # Deprecated parameters + reward_model: Optional[Union[PreTrainedModel, nn.Module]] = None, + reward_processing_class: Optional[PreTrainedTokenizerBase] = None, + ) -> None: + + if hasattr(model, 'vllm_engine') and hasattr(args, 'use_vllm'): + if (getattr(args, 'use_vllm', False) == False): + args.use_vllm = True + if ref_model is model: + raise ValueError( + "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the " + "same as `model`, either omit the `ref_model` argument or pass `None`." + ) + + self.ref_model = ref_model + + # Handle deprecated parameters for backward compatibility + if reward_model is not None: + warnings.warn( + "The `reward_model` parameter is deprecated and will be removed in version 0.25.0. " + "Please use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`.", + ) + # Convert old reward_model to new reward_funcs format + if reward_funcs is None: + reward_funcs = reward_model + else: + warnings.warn( + "Both `reward_model` and `reward_funcs` are provided. Using `reward_funcs` and ignoring " + "`reward_model`.", + ) + + if reward_processing_class is not None: + warnings.warn( + "The `reward_processing_class` parameter is deprecated and will be removed in version 0.25.0. " + "Please use `reward_processing_classes` instead. For example, change " + "`reward_processing_class=tokenizer` to `reward_processing_classes=tokenizer`.", + ) + # Convert old reward_processing_class to new reward_processing_classes format + if reward_processing_classes is None: + reward_processing_classes = reward_processing_class + else: + warnings.warn( + "Both `reward_processing_class` and `reward_processing_classes` are provided. Using " + "`reward_processing_classes` and ignoring `reward_processing_class`.", + ) + + # Validate reward configuration - must have exactly one of: judge, or reward_funcs + reward_configs = sum(x is not None for x in [judge, reward_funcs]) + if reward_configs == 0: + raise ValueError("One of `judge` or `reward_funcs` must be provided.") + elif reward_configs > 1: + if judge is not None: + logger.warning( + "Both `judge` and `reward_funcs` are provided. 
Using `judge` and ignoring `reward_funcs`.",
+                )
+                reward_funcs = None
+        self.judge = judge
+
+        # Handle reward_funcs
+        if reward_funcs is not None:
+            if not isinstance(reward_funcs, list):
+                reward_funcs = [reward_funcs]
+            self.reward_func_names = []
+
+            # Process reward functions [convert strings to models, collect names]
+            model_init_kwargs = args.model_init_kwargs or {}
+            for i, reward_func in enumerate(reward_funcs):
+                if isinstance(reward_func, str):
+                    # Load model from string path
+                    reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained(
+                        reward_func, num_labels=1, **model_init_kwargs
+                    )
+                if isinstance(reward_funcs[i], nn.Module):
+                    self.reward_func_names.append(reward_funcs[i].config._name_or_path.split("/")[-1])
+                else:
+                    self.reward_func_names.append(reward_funcs[i].__name__)
+            self.reward_funcs = reward_funcs
+
+            # Handle reward processing classes for reward_funcs
+            if reward_processing_classes is None:
+                reward_processing_classes = [None] * len(reward_funcs)
+            elif not isinstance(reward_processing_classes, list):
+                reward_processing_classes = [reward_processing_classes]
+            else:
+                if len(reward_processing_classes) != len(reward_funcs):
+                    raise ValueError(
+                        "The number of reward processing classes must match the number of reward functions."
+                    )
+
+            self.reward_processing_classes = []
+            for reward_processing_class_i, reward_func in zip(reward_processing_classes, reward_funcs):
+                if isinstance(reward_func, PreTrainedModel):
+                    if reward_processing_class_i is None:
+                        reward_processing_class_i = AutoTokenizer.from_pretrained(reward_func.config._name_or_path)
+                    if reward_processing_class_i.pad_token_id is None:
+                        reward_processing_class_i.pad_token = reward_processing_class_i.eos_token
+                    # Set pad token ID on reward model config
+                    reward_func.config.pad_token_id = reward_processing_class_i.pad_token_id
+                self.reward_processing_classes.append(reward_processing_class_i)
+        else:
+            self.reward_funcs = None
+            self.reward_func_names = []
+            self.reward_processing_classes = []
+
+        # Handle reward_weights
+        if reward_funcs is not None:
+            if args.reward_weights is not None:
+                if len(args.reward_weights) != len(self.reward_funcs):
+                    raise ValueError(
+                        f"Number of reward weights ({len(args.reward_weights)}) must match number of reward "
+                        f"functions ({len(self.reward_funcs)})"
+                    )
+                self.reward_weights = torch.tensor(args.reward_weights, dtype=torch.float32)
+            else:
+                self.reward_weights = torch.ones(len(self.reward_funcs), dtype=torch.float32)
+        else:
+            self.reward_weights = None
+
+        if args.missing_eos_penalty is not None and reward_funcs is None and judge is None:
+            # Check if this is the old reward_model case
+            if reward_model is not None:
+                logger.warning(
+                    "The `missing_eos_penalty` parameter is deprecated when used with the deprecated `reward_model` parameter. "
" + "Please use `reward_funcs` instead of `reward_model` to continue using this feature.", + DeprecationWarning, + stacklevel=2, + ) + else: + raise ValueError("`missing_eos_penalty` is only supported when `reward_funcs` is provided.") + + if args is None: + raise ValueError("`args` must be provided.") + + # Check that the processing_class is provided + if processing_class is None: + raise ValueError("`processing_class` must be provided.") + + model_init_kwargs = args.model_init_kwargs or {} + if isinstance(model, str): + model_id = model + + # Handle dtype in model_init_kwargs + dtype = model_init_kwargs.get("dtype") + if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None: + pass + elif isinstance(dtype, str): + dtype = getattr(torch, dtype) + model_init_kwargs["dtype"] = dtype + else: + raise ValueError( + "Invalid `dtype` passed to `OnlineDPOConfig`. Expected either 'auto' or a string " + f"representing a `torch.dtype` (e.g., 'float32'), but got {dtype}." + ) + + model = AutoModelForCausalLM.from_pretrained(model_id, **model_init_kwargs) + else: + if args.model_init_kwargs is not None: + raise ValueError( + "You passed `model_init_kwargs` to the `OnlineDPOConfig`, but your model is already instantiated. " + "This argument can only be used when the `model` argument is a string." + ) + self.is_encoder_decoder = model.config.is_encoder_decoder + self.is_vision_model = model.config.model_type in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.keys() + + if False: + model = prepare_peft_model(model, peft_config, args) + + # Enable gradient checkpointing if requested + if args.gradient_checkpointing: + model = self._enable_gradient_checkpointing(model, args) + + # Disable dropout in the model and reference model + if args.disable_dropout: + disable_dropout_in_model(model) + if self.ref_model is not None: + disable_dropout_in_model(self.ref_model) + + # Handle the ref_model + # Usually, the user wants the ref model to be the initial version of the model. When using PEFT, it's easy to + # get the ref model, as it's just the model with a disabled adapter. When not using PEFT, we need to create + # the ref model from the model by copying it and disable the gradients and set it in evaluation mode. + if ref_model is None: # No ref model provided, the most common case + if False: + self.ref_model = create_reference_model(model) # copy, disable gradients, set eval mode + else: + self.ref_model = None # we don't need a ref model here, we can just disable the adapter. 
+ else: # rare case, the user provided a ref model + self.ref_model = ref_model + self.ref_model.eval() + + # Disable the gradient and set the reward model in eval mode + if reward_funcs is not None: + for reward_func in reward_funcs: + if isinstance(reward_func, PreTrainedModel): + reward_func.eval() + + self.max_length = args.max_length + + self.stats = { + "objective/kl": [], + "objective/entropy": [], + "objective/non_score_reward": [], + "rewards/chosen": [], + "rewards/rejected": [], + "rewards/accuracies": [], + "rewards/margins": [], + "logps/chosen": [], + "logps/rejected": [], + "val/contain_eos_token": [], + "beta": [], + } + if self.reward_funcs is not None: + self.stats["objective/rlhf_reward"] = [] + self.stats["objective/scores_margin"] = [] + self.stats["objective/scores"] = [] + + # Store generation parameters for later use + self.use_vllm = args.use_vllm + self.num_generations = 2 # Generate 2 completions per prompt for Online DPO + self.temperature = args.temperature + self.top_p = args.top_p + self.top_k = args.top_k + self.min_p = args.min_p + self.repetition_penalty = args.repetition_penalty + self.use_transformers_paged = args.use_transformers_paged + self.vllm_mode = args.vllm_mode if args.use_vllm else None + self.vllm_gpu_memory_utilization = args.vllm_gpu_memory_utilization + self.vllm_tensor_parallel_size = args.vllm_tensor_parallel_size + self.vllm_model_impl = args.vllm_model_impl + + # Handle pad token for processors or tokenizers + if isinstance(processing_class, ProcessorMixin): + tokenizer = processing_class.tokenizer + elif isinstance(processing_class, PreTrainedTokenizerBase): + tokenizer = processing_class + else: + raise TypeError("The `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`") + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + self.pad_token = tokenizer.pad_token + self.pad_token_id = tokenizer.pad_token_id + self.eos_token_id = tokenizer.eos_token_id + + # Vision tokens for VLM support + self.image_token_id = getattr(processing_class, "image_token_id", None) + self.vision_start_token_id = getattr(processing_class, "vision_start_token_id", None) + self.vision_end_token_id = getattr(processing_class, "vision_end_token_id", None) + # Get the image token string for token collapsing + self.image_token = None + if self.image_token_id is not None: + self.image_token = tokenizer.decode([self.image_token_id]) + + # Define the collator if not provided + if data_collator is None: + data_collator = DPODataCollatorWithPadding(pad_token_id=self.pad_token_id) + + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in Online DPO, the sampled data does not include + # the "input_ids" key. As a result, the trainer issues the warning: "Could not estimate the number of tokens + # of the input, floating-point operations will not be computed." To suppress this warning, we set the + # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate + # that the warning has already been issued. 
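+        # (The flag below works because `warnings_issued` doubles as a once-only
+        # marker: pre-setting "estimate_tokens" makes the Trainer skip the token
+        # estimate silently instead of warning on the first step.)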
+ model.warnings_issued["estimate_tokens"] = True + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + self._beta = args.beta + + # Set up generation configuration and vLLM after super[].__init__ + if self.use_vllm: + if not is_vllm_available(): + raise ImportError( + "vLLM is not available and `use_vllm` is set to True. Please install vLLM with " + "`pip install vllm` to use it." + ) + + if self.vllm_mode == "server": + if self.accelerator.is_main_process: + if args.vllm_server_base_url is not None: + base_url = args.vllm_server_base_url + else: + base_url = f"http://{args.vllm_server_host}:{args.vllm_server_port}" + self.vllm_client = VLLMClient(base_url=base_url, connection_timeout=args.vllm_server_timeout) + self.vllm_client.init_communicator(device=torch.cuda.current_device()) + else: + self.vllm_client = None + elif self.vllm_mode == "colocate": + vllm_kwargs = { + "model": model.name_or_path, + "tensor_parallel_size": self.vllm_tensor_parallel_size, + "gpu_memory_utilization": self.vllm_gpu_memory_utilization, + "model_impl": self.vllm_model_impl, + "max_num_seqs": self.args.per_device_train_batch_size * self.vllm_tensor_parallel_size, + "max_model_len": args.max_length + args.max_new_tokens, + "distributed_executor_backend": "external_launcher", + "seed": self.accelerator.process_index // self.vllm_tensor_parallel_size, + "max_num_batched_tokens": 4096, + } + os.environ["RANK"] = str(self.accelerator.process_index) + os.environ["LOCAL_RANK"] = str(self.accelerator.local_process_index) + os.environ["WORLD_SIZE"] = str(self.accelerator.num_processes) + os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "localhost") + os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "12345") + + self.llm = model.vllm_engine + else: + raise ValueError(f"vllm_mode must be either 'server' or 'colocate', got '{self.vllm_mode}'.") + self.guided_decoding_regex = args.vllm_guided_decoding_regex + self._last_loaded_step = -1 + generation_params = { + "n": 2, + "repetition_penalty": self.repetition_penalty, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": -1 if self.top_k is None else self.top_k, + "min_p": 0.0 if self.min_p is None else self.min_p, + "max_tokens": args.max_new_tokens, + "detokenize": False, + } + if args.generation_kwargs is not None: + generation_params.update(args.generation_kwargs) + if self.guided_decoding_regex: + generation_params["guided_decoding"] = GuidedDecodingParams(regex=self.guided_decoding_regex) + self.generation_config = SamplingParams(**generation_params) + self.accelerator.wait_for_everyone() + else: + # Set up transformers generation config + generation_kwargs = { + "max_new_tokens": args.max_new_tokens, + "do_sample": True, + "pad_token_id": self.pad_token_id, + "bos_token_id": tokenizer.bos_token_id, + "eos_token_id": self.eos_token_id, + "temperature": self.temperature, + "top_k": self.top_k, + "top_p": self.top_p, + "repetition_penalty": self.repetition_penalty, + "use_cache": True if not self.args.gradient_checkpointing else False, + } + # Add min_p if supported + if self.min_p is not 
None: + generation_kwargs["min_p"] = self.min_p + if args.generation_kwargs is not None: + generation_kwargs.update(args.generation_kwargs) + if self.use_transformers_paged: + generation_kwargs["max_batch_tokens"] = 512 + generation_kwargs["num_blocks"] = 1024 + generation_kwargs["block_size"] = 128 + # Remove None values + generation_kwargs = {k: v for k, v in generation_kwargs.items() if v is not None} + self.generation_config = GenerationConfig(**generation_kwargs) + + if self.is_deepspeed_enabled: + if self.ref_model is not None: + self.ref_model = prepare_deepspeed( + self.ref_model, args.per_device_train_batch_size, args.fp16, args.bf16 + ) + # Prepare reward function models for DeepSpeed + if self.reward_funcs is not None: + for i, reward_func in enumerate(self.reward_funcs): + if isinstance(reward_func, PreTrainedModel): + self.reward_funcs[i] = prepare_deepspeed(reward_func, self.accelerator) + else: + if self.ref_model is not None: + self.ref_model = self.ref_model.to(self.accelerator.device) + # Prepare reward function models for FSDP/regular training + if self.reward_funcs is not None: + for i, reward_func in enumerate(self.reward_funcs): + if isinstance(reward_func, PreTrainedModel): + # Set device placement to True to make `prepare_model` move `reward_func` to device when using fsdp + self.reward_funcs[i] = self.accelerator.prepare_model( + reward_func, evaluation_mode=True, device_placement=True + ) + + @property + def beta(self): + if isinstance(self._beta, list): + epoch = self.state.epoch + return self._beta[epoch] if epoch < len(self._beta) else self._beta[-1] + else: + return self._beta + + @staticmethod + def tokenize_row(feature, is_encoder_decoder: bool, tokenizer: PreTrainedTokenizerBase) -> dict[str, Any]: + """Tokenize a single row from a DPO specific dataset.""" + if not is_encoder_decoder: + batch = tokenizer(feature["prompt"], add_special_tokens=False) + # Add BOS token to head of prompt. Avoid adding if it's already there + if tokenizer.bos_token_id is not None: + prompt_len_input_ids = len(batch["input_ids"]) + if prompt_len_input_ids == 0 or tokenizer.bos_token_id != batch["input_ids"][0]: + batch["input_ids"] = [tokenizer.bos_token_id] + batch["input_ids"] + batch["attention_mask"] = [1] + batch["attention_mask"] + else: + batch = tokenizer(feature["prompt"], add_special_tokens=True) + batch = {f"prompt_{key}": value for key, value in batch.items()} + return batch + + # Same as Trainer.get_train_dataloader but skip the "remove_unused_columns". + @wraps(Trainer.get_train_dataloader) + def get_train_dataloader(self) -> DataLoader: + if self.train_dataset is None: + raise ValueError("Trainer: training requires a train_dataset.") + + train_dataset = self.train_dataset + data_collator = self.data_collator + dataloader_params = { + "batch_size": self._train_batch_size, + "collate_fn": data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "persistent_workers": self.args.dataloader_persistent_workers, + } + + if not isinstance(train_dataset, torch.utils.data.IterableDataset): + dataloader_params["sampler"] = self._get_train_sampler() + dataloader_params["drop_last"] = self.args.dataloader_drop_last + dataloader_params["worker_init_fn"] = seed_worker + dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor + + return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params)) + + # Same as Trainer.get_eval_dataloader but skip the "remove_unused_columns". 
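+    # (Keeping "unused" columns matters here: Online DPO samples completions at
+    # train time, and `training_step` reads raw columns such as "image" and any
+    # extra reward-function kwargs straight from the batch, so they must survive
+    # collation.)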
+ @wraps(Trainer.get_eval_dataloader) + def get_eval_dataloader(self, eval_dataset: Optional[Union[str, Dataset]] = None) -> DataLoader: + if eval_dataset is None and self.eval_dataset is None: + raise ValueError("Trainer: evaluation requires an eval_dataset.") + + # If we have persistent workers, don't do a fork bomb especially as eval datasets + # don't change during training + dataloader_key = eval_dataset if isinstance(eval_dataset, str) else "eval" + if ( + hasattr(self, "_eval_dataloaders") + and dataloader_key in self._eval_dataloaders + and self.args.dataloader_persistent_workers + ): + return self.accelerator.prepare(self._eval_dataloaders[dataloader_key]) + + eval_dataset = ( + self.eval_dataset[eval_dataset] + if isinstance(eval_dataset, str) + else eval_dataset + if eval_dataset is not None + else self.eval_dataset + ) + data_collator = self.data_collator + + dataloader_params = { + "batch_size": self.args.eval_batch_size, + "collate_fn": data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "persistent_workers": self.args.dataloader_persistent_workers, + } + + if not isinstance(eval_dataset, torch.utils.data.IterableDataset): + dataloader_params["sampler"] = self._get_eval_sampler(eval_dataset) + dataloader_params["drop_last"] = self.args.dataloader_drop_last + dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor + + # accelerator.free_memory() will destroy the references, so + # we need to store the non-prepared version + eval_dataloader = DataLoader(eval_dataset, **dataloader_params) + if self.args.dataloader_persistent_workers: + if hasattr(self, "_eval_dataloaders"): + self._eval_dataloaders[dataloader_key] = eval_dataloader + else: + self._eval_dataloaders = {dataloader_key: eval_dataloader} + + return self.accelerator.prepare(eval_dataloader) + + def _enable_gradient_checkpointing(self, model: PreTrainedModel, args: OnlineDPOConfig) -> PreTrainedModel: + """Enables gradient checkpointing for the model.""" + # Ensure use_cache is disabled + model.config.use_cache = False + + # Enable gradient checkpointing on the base model for PEFT + if is_peft_model(model): + model.base_model.gradient_checkpointing_enable() + # Enable gradient checkpointing for non-PEFT models + else: + model.gradient_checkpointing_enable() + + gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs or {} + use_reentrant = ( + "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"] + ) + + if use_reentrant: + model.enable_input_require_grads() + + return model + + def _generate_vllm(self, prompts, images=None): + eos_token_id = self.eos_token_id + pad_token_id = self.pad_token_id + + # Generate completion_ids and prompt_ids based on mode + if self.vllm_mode == "server": + completion_ids, prompt_ids = self._generate_vllm_server(prompts, images) + elif self.vllm_mode == "colocate": + completion_ids, prompt_ids = self._generate_vllm_colocate(prompts, images) + + # Shared padding, masking, and tensor conversion logic + max_prompt_length = max(len(ids) for ids in prompt_ids) + prompt_mask = [[0] * (max_prompt_length - len(ids)) + [1] * len(ids) for ids in prompt_ids] + prompt_ids = [[pad_token_id] * (max_prompt_length - len(ids)) + ids for ids in prompt_ids] + max_tokens = self.generation_config.max_tokens + completion_mask = [[1] * len(ids) + [0] * (max_tokens - len(ids)) for ids in completion_ids] + completion_ids = [ + ids + [eos_token_id] if ids[-1] != 
eos_token_id and len(ids) < max_tokens else ids + for ids in completion_ids + ] + completion_ids = [ids + [pad_token_id] * (max_tokens - len(ids)) for ids in completion_ids] + + # Convert to tensors + prompt_ids = torch.tensor(prompt_ids, device=self.accelerator.device) + prompt_mask = torch.tensor(prompt_mask, device=self.accelerator.device) + completion_ids = torch.tensor(completion_ids, device=self.accelerator.device) + completion_mask = torch.tensor(completion_mask, device=self.accelerator.device) + + return prompt_ids, prompt_mask, completion_ids, completion_mask + + def _generate_vllm_server(self, prompts, images=None): + """Generate completions using vLLM server mode""" + has_images = images is not None + + # Update vLLM server weights if needed + if hasattr(self, "_last_loaded_step") and self.state.global_step != self._last_loaded_step: + self._move_model_to_vllm() + self._last_loaded_step = self.state.global_step + elif not hasattr(self, "_last_loaded_step"): + self._move_model_to_vllm() + self._last_loaded_step = self.state.global_step + + # Apply chat template if conversational + if is_conversational({"prompt": prompts[0]}): + prompts_text = [apply_chat_template({"prompt": p}, self.processing_class)["prompt"] for p in prompts] + else: + prompts_text = prompts + # Gather all prompts to main process + all_prompts = gather_object(prompts_text) + if has_images: + all_images = gather_object(images) + + if self.accelerator.is_main_process: + # Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and generate + # num_generations outputs for each one. This is faster than generating outputs for each duplicate + # prompt individually. + ordered_set_of_prompts = all_prompts[:: self.num_generations] + if has_images: + ordered_set_of_images = all_images[:: self.num_generations] + else: + ordered_set_of_images = None + completion_ids = self.vllm_client.generate( + prompts=ordered_set_of_prompts, + images=ordered_set_of_images, + n=self.num_generations, + repetition_penalty=self.repetition_penalty, + temperature=self.temperature, + top_p=self.top_p, + top_k=-1 if self.top_k is None else self.top_k, + min_p=0.0 if self.min_p is None else self.min_p, + max_tokens=self.generation_config.max_tokens, + guided_decoding_regex=self.guided_decoding_regex if hasattr(self, "guided_decoding_regex") else None, + generation_kwargs=self.args.generation_kwargs, + ) + # Flatten: each prompt generates 2 completions + completion_ids = [[comp_id] for prompt_completions in completion_ids for comp_id in prompt_completions] + else: + completion_ids = [None] * (len(all_prompts) * 2) + + # Broadcast completions to all processes + completion_ids = broadcast_object_list(completion_ids, from_process=0) + + # Each process takes its slice + process_slice = slice( + self.accelerator.process_index * len(prompts) * 2, + (self.accelerator.process_index + 1) * len(prompts) * 2, + ) + completion_ids = completion_ids[process_slice] + + # Create prompt_ids by tokenizing locally + prompt_inputs = self.processing_class( + text=prompts_text, + return_tensors="pt", + padding=True, + padding_side="left", + add_special_tokens=False, + ) + prompt_ids = [] + for prompt_tokens in prompt_inputs["input_ids"]: + prompt_ids.extend([prompt_tokens.tolist(), prompt_tokens.tolist()]) # 2 copies for 2 completions + return completion_ids, prompt_ids + + def _generate_vllm_colocate(self, prompts, images=None): + """Generate completions using vLLM colocate mode""" + # Update model weights if needed + 
self._move_model_to_vllm() + + # Apply chat template if conversational + if is_conversational({"prompt": prompts[0]}): + prompts_text = [apply_chat_template({"prompt": p}, self.processing_class)["prompt"] for p in prompts] + else: + prompts_text = prompts + + # Prepare vLLM inputs with images if available + if images is not None: + vllm_inputs = [] + for prompt, image in zip(prompts_text, images): + if image is not None: + vllm_inputs.append({"prompt": prompt, "multi_modal_data": {"image": image}}) + else: + vllm_inputs.append(prompt) + else: + vllm_inputs = prompts_text + + outputs = self.llm.generate(vllm_inputs, self.generation_config, use_tqdm=False, lora_request = self.model.load_lora('online_dpo_trainer_lora_model', load_tensors = True)) + + completion_ids = [list(output.outputs[i].token_ids) for i in range(2) for output in outputs] + prompt_ids = [list(output.prompt_token_ids) for _ in range(2) for output in outputs] + + return completion_ids, prompt_ids + + def _move_model_to_vllm(self): + """Synchronize model weights to vLLM server with support for PEFT, DeepSpeed, and FSDP""" + # For DeepSpeed ZeRO-3 and FSDP, we need to gather all parameters before operations + deepspeed_plugin = self.accelerator.state.deepspeed_plugin + zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3 + if zero_stage_3: + import deepspeed + + gather_if_zero3 = deepspeed.zero.GatheredParameters + else: + gather_if_zero3 = nullcontext + + if is_peft_model(self.model): + # With PEFT and FSDP/DeepSpeed ZeRO Stage 3, we must gather the full model at once before merging, as + # merging adapters in a sharded manner is not supported. + # TODO: does this work with FSDP? + with gather_if_zero3(list(self.model.parameters())): + self.model.merge_adapter() + + # Update vLLM weights while parameters are gathered + if self.is_fsdp_enabled: # note if using FSDP, gather_if_zero3 is nullcontext + # Update vLLM weights while parameters are gathered + # For PEFT with FSDP we need to use the memory efficient post-order traversal + fsdp_plugin = getattr(self.accelerator.state, "fsdp_plugin", None) + fsdp_version = getattr(fsdp_plugin, "fsdp_version", 1) if fsdp_plugin else 1 + if fsdp_version == 1: + # use memory-efficient post-order traversal for FSDP + self._sync_fsdp1_params_to_vllm(self.model) + elif fsdp_version == 2: + self._sync_fsdp2_params_to_vllm(self.model) + else: + # DeepSpeed ZeRO-3 with PEFT + for name, param in self.model.named_parameters(): + # When using PEFT, we need to recover the original parameter name and discard some parameters + name = name.removeprefix("base_model.model.").replace(".base_layer", "") + if self.model.prefix in name: + continue + # When module to save, remove its prefix and discard the original module + if "original_module" in name: + continue + name = self._fix_param_name_to_vllm(name, extra_prefixes=["modules_to_save.default."]) + + if self.vllm_mode == "server" and self.accelerator.is_main_process: + self.vllm_client.update_named_param(name, param.data) + elif self.vllm_mode == "colocate": + + pass + + pass + # Unmerge adapters while parameters are still gathered + self.model.unmerge_adapter() + # Parameters will automatically be repartitioned when exiting the context + else: + # For non-PEFT models, simply gather (if needed) and update each parameter individually. 
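+            # Example of the per-parameter name cleanup (illustrative): an FSDP
+            # activation-checkpointing name such as
+            #     "model.layers.0._checkpoint_wrapped_module.self_attn.q_proj.weight"
+            # is rewritten by `_fix_param_name_to_vllm` to
+            #     "model.layers.0.self_attn.q_proj.weight"
+            # before being sent to the vLLM side.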
+            if self.is_fsdp_enabled:
+                fsdp_plugin = getattr(self.accelerator.state, "fsdp_plugin", None)
+                fsdp_version = getattr(fsdp_plugin, "fsdp_version", 1) if fsdp_plugin else 1
+                if fsdp_version == 1:
+                    self._sync_fsdp1_params_to_vllm(self.model)  # use memory-efficient post-order traversal for FSDP
+                elif fsdp_version == 2:
+                    self._sync_fsdp2_params_to_vllm(self.model)
+            else:
+                for name, param in self.model.named_parameters():
+                    name = self._fix_param_name_to_vllm(name)
+                    with gather_if_zero3([param]):
+                        if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                            self.vllm_client.update_named_param(name, param.data)
+                        elif self.vllm_mode == "colocate":
+                            # no-op in this build: colocate mode reloads weights through
+                            # the LoRA request passed to `llm.generate`
+                            pass
+
+        # Reset cache on vLLM
+        if self.vllm_mode == "server" and self.accelerator.is_main_process:
+            self.vllm_client.reset_prefix_cache()
+        elif self.vllm_mode == "colocate":
+            self.llm.reset_prefix_cache()
+
+    def _sync_fsdp1_params_to_vllm(self, module: nn.Module, prefix: str = "", visited=None):
+        """Memory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM."""
+        # For FSDP1, we need to recurse into children and also use summon_full_params
+        if visited is None:
+            visited = set()
+        for child_name, child_module in module.named_children():
+            child_prefix = f"{prefix}.{child_name}" if prefix else child_name
+            self._sync_fsdp1_params_to_vllm(
+                child_module, prefix=child_prefix, visited=visited
+            )  # recurse into the child
+
+        if isinstance(module, FSDP):
+            with FSDP.summon_full_params(module, recurse=False, writeback=False):
+                for param_name, param in module.named_parameters():
+                    full_name = f"{prefix}.{param_name}" if prefix else param_name
+                    full_name = self._fix_param_name_to_vllm(full_name, extra_prefixes=["_fsdp_wrapped_module."])
+
+                    if full_name in visited:
+                        continue  # skip FSDP subtrees already traversed
+                    visited.add(full_name)
+
+                    if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                        self.vllm_client.update_named_param(full_name, param.data)
+                    elif self.vllm_mode == "colocate":
+                        pass  # no-op in this build (see note above)
+
+    def _sync_fsdp2_params_to_vllm(self, module: nn.Module):
+        # For FSDP2, module.state_dict() already covers all parameters, so no need for recursion
+        for name, param in module.state_dict().items():
+            if param.is_cpu:
+                param = param.to(torch.device("cuda"))
+            param = param.full_tensor()
+
+            if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                self.vllm_client.update_named_param(name, param)
+            elif self.vllm_mode == "colocate":
+                pass  # no-op in this build (see note above)
+
+    def _fix_param_name_to_vllm(self, name, extra_prefixes: Optional[list[str]] = None):
+        """Clean parameter names for vLLM compatibility"""
+        extra_prefixes = extra_prefixes or []
+        prefixes = ["_checkpoint_wrapped_module."] + extra_prefixes
+        for prefix in prefixes:
+            name = name.replace(prefix, "")
+        return name
+
+    def process_vision_row(
+        self, features: dict[str, Union[list, torch.Tensor]], processing_class=None
+    ) -> dict[str, list[int]]:
+        """
+        Process a vision row for VLM models (adapted from DPO trainer)
+        """
+        processor = processing_class or self.processing_class
+        processed_features = processor(images=[features["image"]], text=features["prompt"], add_special_tokens=False)
+
+        prompt_input_ids = processed_features["input_ids"][0]
+
+        # Create the output dict with required fields
+        output = {
+            "prompt_input_ids": prompt_input_ids,
+            "prompt_attention_mask": processed_features["attention_mask"][0],
+        }
+
+        # Add vision-specific fields
+        if "pixel_values" in processed_features:
+            output["pixel_values"] =
processed_features["pixel_values"][0] + if "pixel_attention_mask" in processed_features: + output["pixel_attention_mask"] = processed_features["pixel_attention_mask"][0] + if "image_sizes" in processed_features: + output["image_sizes"] = processed_features["image_sizes"][0] + + return output + + def _generate(self, model, prompts, images=None): + """Generate completions using the model""" + device = next(model.parameters()).device + eos_token_id = self.eos_token_id + pad_token_id = self.pad_token_id + + # Apply chat template and tokenize the input + inputs = [{"prompt": prompt} for prompt in prompts] + + # Add images if provided (VLM support) + if images is not None: + for i, image in enumerate(images): + inputs[i]["image"] = image + + # Apply chat template to get text prompts + prompts_text = [maybe_apply_chat_template(x, self.processing_class)["prompt"] for x in inputs] + + # Handle image token collapsing/removal + # The chat template sometimes inserts a single image token into the prompt text. However, when this text is + # later tokenized, the single image token string is expanded into multiple image token IDs, depending on the + # image size. We need to handle this properly. + if self.image_token is not None and images is not None: + escaped_img_token = re.escape(self.image_token) + # Search for the image token in the chat template + if hasattr(self.processing_class, "chat_template") and self.processing_class.chat_template: + if re.search(escaped_img_token, self.processing_class.chat_template): + # Collapse repeated image tokens back into a single token + prompts_text = [ + re.sub(rf"({escaped_img_token})+", self.image_token, text) for text in prompts_text + ] + else: + # If the chat template doesn't use the image token, remove all instances + if self.vision_end_token_id is not None: + escaped_eoi_token = re.escape( + self.processing_class.tokenizer.decode([self.vision_end_token_id]) + ) + prompts_text = [ + re.sub(rf"({escaped_img_token})+{escaped_eoi_token}", "", text) for text in prompts_text + ] + else: + # If vision_end_token_id is None, just remove the image tokens + prompts_text = [re.sub(rf"({escaped_img_token})+", "", text) for text in prompts_text] + + # Prepare kwargs for processing class + kwargs = {} + if images is not None: + kwargs = {"images": [[img] for img in images]} + + # Process inputs using the processing class (handles both VLM and LLM) + prompt_inputs = self.processing_class( + text=prompts_text, + return_tensors="pt", + padding=True, + padding_side="left", + add_special_tokens=False, + **kwargs, + ) + + prompt_inputs = {k: v.to(device) for k, v in prompt_inputs.items()} + # Convert vision inputs to model's dtype for proper computation + if "pixel_values" in prompt_inputs: + # Handle DataParallel wrapped models + model_dtype = getattr(model, "dtype", None) + if model_dtype is None and hasattr(model, "module"): + model_dtype = model.module.dtype + if model_dtype is not None: + prompt_inputs["pixel_values"] = prompt_inputs["pixel_values"].to(model_dtype) + + # Sample 2 completions per prompt of size `max_new_tokens` from the model + prompt_ids = prompt_inputs["input_ids"].repeat(2, 1) + prompt_mask = prompt_inputs["attention_mask"].repeat(2, 1) + + # Prepare vision inputs if available + vision_generation_kwargs = {} + if self.is_vision_model and images is not None: + if "pixel_values" in prompt_inputs: + vision_generation_kwargs["pixel_values"] = prompt_inputs["pixel_values"].repeat(2, 1, 1, 1) + if "pixel_attention_mask" in prompt_inputs: + 
vision_generation_kwargs["pixel_attention_mask"] = prompt_inputs["pixel_attention_mask"].repeat(2, 1) + if "image_sizes" in prompt_inputs: + vision_generation_kwargs["image_sizes"] = prompt_inputs["image_sizes"].repeat(2, 1) + if "image_grid_thw" in prompt_inputs: + vision_generation_kwargs["image_grid_thw"] = prompt_inputs["image_grid_thw"].repeat(2, 1) + + if self.use_transformers_paged: + previous_attn = self.model_wrapped.config._attn_implementation + + if is_flash_attn_2_available(): + self.model_wrapped.config._attn_implementation = "paged_attention" + else: + self.model_wrapped.config._attn_implementation = "sdpa_paged" + with ( + profiling_context(self, "transformers.generate_batch"), + unwrap_model_for_generation( + model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model, + torch.no_grad(), + FSDP.summon_full_params(self.model_wrapped, recurse=False) if self.is_fsdp_enabled else nullcontext(), + ): + # Cast to the appropriate dtype based on training configuration + if self.args.bf16: + unwrapped_model.to(torch.bfloat16) + elif self.args.fp16: + unwrapped_model.to(torch.float16) + with torch.inference_mode(): + all_outputs = unwrapped_model.generate_batch( + prompt_ids.tolist(), + generation_config=self.generation_config, + progress_bar=False, + ) + completion_ids = [output.generated_tokens for output in all_outputs.values()] + completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids] + completion_ids = pad(completion_ids, padding_value=self.pad_token_id, padding_side="right") + prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1) + # Restore the original attention implementation, training mode + self.model_wrapped.config._attn_implementation = previous_attn + + # Extract completion_ids and create completion_mask + prompt_length = prompt_ids.size(1) + completion_ids = prompt_completion_ids[:, prompt_length:] + completion_ids, completion_mask = truncate_right(completion_ids, eos_token_id, pad_token_id) + + return prompt_ids, prompt_mask, completion_ids, completion_mask + else: + # Regular generation path + with ( + profiling_context(self, "transformers.generate"), + unwrap_model_for_generation( + model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model, + torch.no_grad(), + FSDP.summon_full_params(self.model_wrapped, recurse=False) if self.is_fsdp_enabled else nullcontext(), + ): + # Setup cache implementation if specified + if self.args.cache_implementation is not None: + unwrapped_model.generation_config.cache_implementation = self.args.cache_implementation + + # Standard generation + output = unwrapped_model.generate( + input_ids=prompt_ids, + attention_mask=prompt_mask, + generation_config=self.generation_config, + **vision_generation_kwargs, + ) + + completion_ids = output[:, prompt_ids.size(1) :] + completion_ids, completion_mask = truncate_right(completion_ids, eos_token_id, pad_token_id) + + return prompt_ids, prompt_mask, completion_ids, completion_mask + + def _calculate_rewards_from_functions(self, prompts, completions, completion_ids_list, **reward_kwargs): + """ + Calculate rewards using reward functions + """ + device = self.accelerator.device + rewards_per_func = torch.zeros(len(prompts), len(self.reward_funcs), device=device) + + # Add trainer state to reward kwargs for dynamic reward shaping + reward_kwargs["trainer_state"] = self.state + + for i, (reward_func, reward_processing_class) in enumerate( + zip(self.reward_funcs, 
self.reward_processing_classes) + ): + if isinstance(reward_func, nn.Module): # Model-based reward function + # Handle conversational vs text input + if is_conversational({"prompt": prompts[0]}): + messages = [{"messages": p + c} for p, c in zip(prompts, completions)] + texts = [apply_chat_template(x, reward_processing_class)["text"] for x in messages] + else: + texts = [p + c for p, c in zip(prompts, completions)] + + # Tokenize and get reward scores + reward_inputs = reward_processing_class( + text=texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False + ) + reward_inputs = {k: v.to(device) for k, v in reward_inputs.items()} + + with torch.inference_mode(): + rewards_per_func[:, i] = reward_func(**reward_inputs).logits[:, 0] # Shape (B*G,) + else: + # Custom reward function + output_reward_func = reward_func( + prompts=prompts, completions=completions, completion_ids=completion_ids_list, **reward_kwargs + ) + # Convert None values to NaN + output_reward_func = [reward if reward is not None else torch.nan for reward in output_reward_func] + rewards_per_func[:, i] = torch.tensor(output_reward_func, dtype=torch.float32, device=device) + + # Weight and sum across all reward functions + if self.reward_weights is not None: + total_rewards = (rewards_per_func * self.reward_weights.to(device).unsqueeze(0)).nansum(dim=1) + else: + total_rewards = rewards_per_func.nansum(dim=1) + + return total_rewards + + def _forward(self, model, prompt_ids, prompt_mask, completion_ids, completion_mask, vision_inputs=None): + # Get the number of tokens to truncate from prompt + num_tokens_to_truncate = max(prompt_ids.size(1) + completion_ids.size(1) - self.max_length, 0) + + # Truncate left to avoid oom + prompt_ids = prompt_ids[:, num_tokens_to_truncate:] + prompt_mask = prompt_mask[:, num_tokens_to_truncate:] + + # Concat the prompt and completion + prompt_completion_ids = torch.cat((prompt_ids, completion_ids), dim=1) + prompt_completion_mask = torch.cat((prompt_mask, completion_mask), dim=1) + + # Prepare model kwargs with vision inputs if available + model_kwargs = {"attention_mask": prompt_completion_mask} + if vision_inputs is not None: + if "pixel_values" in vision_inputs: + model_kwargs["pixel_values"] = vision_inputs["pixel_values"] + if "pixel_attention_mask" in vision_inputs: + model_kwargs["pixel_attention_mask"] = vision_inputs["pixel_attention_mask"] + if "image_sizes" in vision_inputs: + model_kwargs["image_sizes"] = vision_inputs["image_sizes"] + if "image_grid_thw" in vision_inputs: + model_kwargs["image_grid_thw"] = vision_inputs["image_grid_thw"] + + # Get the logprobs of the completions from the model + output = model(prompt_completion_ids, **model_kwargs) + + # There is 1 offset, because the model predict the next token + prompt_len = prompt_ids.size(1) + start_idx = prompt_len - 1 if prompt_len > 0 else 0 + logits = output.logits[:, start_idx:-1] + + # Take the completion tokens logprob + logprobs = torch.take_along_dim(logits.log_softmax(dim=-1), completion_ids.unsqueeze(-1), dim=2).squeeze(-1) + return logprobs + + def training_step( + self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None + ) -> torch.Tensor: + model.train() + + prompts = inputs["prompt"] + batch_size = len(prompts) + + # Handle images for VLM support + has_images = "image" in inputs + images = None + if has_images: + images = inputs["image"] + # Convert conversational prompts to include image tokens + for prompt in prompts: + if 
isinstance(prompt, list): + for message in prompt: + if not isinstance(message, dict): + continue + content = message.get("content") + role = message.get("role") + if isinstance(content, str): + if role == "user": + message["content"] = [{"type": "image"}, {"type": "text", "text": content}] + elif role == "system": + message["content"] = [{"type": "text", "text": content}] + + if self.args.use_vllm: + prompt_ids, prompt_mask, completion_ids, completion_mask = self._generate_vllm(prompts, images) + else: + prompt_ids, prompt_mask, completion_ids, completion_mask = self._generate(model, prompts, images) + + contain_eos_token = torch.any(completion_ids == self.eos_token_id, dim=-1) + + # Extract vision inputs if available for VLM support + vision_inputs = None + if has_images and self.is_vision_model and not self.args.use_vllm: + # For vision models with transformers generation, we need to prepare vision inputs + # Process the images to get vision inputs that can be passed through the forward pass + vision_inputs = {} + kwargs = {"images": [[img] for img in images]} + processed = self.processing_class( + text=[""] * len(images), # Dummy text for vision processing + return_tensors="pt", + **kwargs, + ) + # Handle DataParallel wrapped models + model_device = getattr(model, "device", None) + model_dtype = getattr(model, "dtype", None) + if model_device is None and hasattr(model, "module"): + model_device = model.module.device + model_dtype = model.module.dtype + # Move vision tensors to device and convert to model dtype + # Need to duplicate for 2 completions per prompt + if "pixel_values" in processed: + vision_inputs["pixel_values"] = ( + processed["pixel_values"].to(model_device, dtype=model_dtype).repeat(2, 1, 1, 1) + ) + if "pixel_attention_mask" in processed: + vision_inputs["pixel_attention_mask"] = processed["pixel_attention_mask"].to(model_device).repeat(2, 1) + if "image_sizes" in processed: + vision_inputs["image_sizes"] = processed["image_sizes"].to(model_device).repeat(2, 1) + if "image_grid_thw" in processed: + vision_inputs["image_grid_thw"] = processed["image_grid_thw"].to(model_device).repeat(2, 1) + + logprobs = self._forward(model, prompt_ids, prompt_mask, completion_ids, completion_mask, vision_inputs) + with torch.no_grad(): + if self.ref_model is not None: + ref_logprobs = self._forward( + self.ref_model, prompt_ids, prompt_mask, completion_ids, completion_mask, vision_inputs + ) + else: # peft case: we just need to disable the adapter + with self.model.disable_adapter(): + ref_logprobs = self._forward( + self.model, prompt_ids, prompt_mask, completion_ids, completion_mask, vision_inputs + ) + + # Decode the completions, and format them if the input is conversational + device = logprobs.device + completions = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True) + if is_conversational({"prompt": prompts[0]}): + completions = [[{"role": "assistant", "content": completion}] for completion in completions] + + # Get the reward from reward functions, judge, or deprecated reward_model + if self.reward_funcs is not None: + # First create completion_ids_list for custom reward functions + completion_ids_list = [completion_ids[i].tolist() for i in range(completion_ids.shape[0])] + + # Extract additional fields from inputs for reward functions + reward_kwargs = {} + keys = [key for key in inputs if key not in ["prompt"]] + for key in keys: + if isinstance(inputs[key], (list, tuple)): + # Repeat input fields to match number of completions (2 per prompt) + 
reward_kwargs[key] = inputs[key] * 2  # list repetition yields [v1..vB, v1..vB], aligned with the two completion halves of `2 * prompts`
+                else:
+                    reward_kwargs[key] = inputs[key]
+
+            # Calculate rewards using reward functions
+            rewards = self._calculate_rewards_from_functions(
+                prompts=2 * prompts, completions=completions, completion_ids_list=completion_ids_list, **reward_kwargs
+            )
+
+            # Apply missing EOS penalty if configured
+            if self.args.missing_eos_penalty is not None:
+                rewards[~contain_eos_token] -= self.args.missing_eos_penalty
+
+            # Split rewards into chosen/rejected pairs
+            first_half, second_half = rewards.split(batch_size)
+            mask = first_half >= second_half
+        elif self.judge is not None:
+            # Once formatted, conversational data may contain special tokens (such as <|im_start|>) that are not
+            # directly understandable by the judge and could alter its judgment. To avoid this and make the judge
+            # independent of the model's chat template, we use the raw conversation data, and apply our own chat
+            # template to it.
+            if is_conversational({"prompt": prompts[0]}):
+                environment = jinja2.Environment()
+                template = environment.from_string(SIMPLE_CHAT_TEMPLATE)
+                prompts = [template.render(messages=prompt) for prompt in prompts]
+                completions = [template.render(messages=completion) for completion in completions]
+
+            ranks_of_first_completion = self.judge.judge(
+                prompts, list(zip(completions[:batch_size], completions[batch_size:]))
+            )
+
+            # convert ranks to a True/False mask:
+            # when rank == 0, it means the first completion is the best
+            # when rank == 1, it means the second completion is the best
+            mask = torch.tensor([rank == 0 for rank in ranks_of_first_completion], device=device)
+
+        batch_range = torch.arange(batch_size, device=device)
+        chosen_indices = batch_range + (~mask * batch_size)
+        rejected_indices = batch_range + (mask * batch_size)
+
+        # Build tensor so that the first half is the chosen examples and the second half the rejected examples
+        cr_indices = torch.cat((chosen_indices, rejected_indices), dim=0)  # cr = chosen and rejected
+        cr_logprobs = logprobs[cr_indices]
+        cr_ref_logprobs = ref_logprobs[cr_indices]
+
+        # mask out the padding tokens
+        padding_mask = ~completion_mask.bool()
+        cr_padding_mask = padding_mask[cr_indices]
+
+        cr_logprobs_sum = (cr_logprobs * ~cr_padding_mask).sum(1)
+        cr_ref_logprobs_sum = (cr_ref_logprobs * ~cr_padding_mask).sum(1)
+
+        # Split the chosen and rejected examples
+        chosen_logprobs_sum, rejected_logprobs_sum = torch.split(cr_logprobs_sum, batch_size)
+        chosen_ref_logprobs_sum, rejected_ref_logprobs_sum = torch.split(cr_ref_logprobs_sum, batch_size)
+        pi_logratios = chosen_logprobs_sum - rejected_logprobs_sum
+        ref_logratios = chosen_ref_logprobs_sum - rejected_ref_logprobs_sum
+
+        logits = pi_logratios - ref_logratios
+
+        if self.args.loss_type == "sigmoid":
+            losses = -F.logsigmoid(self.beta * logits)
+        elif self.args.loss_type == "ipo":
+            losses = (logits - 1 / (2 * self.beta)) ** 2
+        else:
+            raise NotImplementedError(f"invalid loss type {self.args.loss_type}")
+
+        loss = losses.mean()
+
+        # Log everything
+        if self.reward_funcs is not None:
+            # When using reward_funcs, we have rewards instead of scores
+            scores_margin = rewards[chosen_indices] - rewards[rejected_indices]
+            self.stats["objective/scores_margin"].append(
+                self.accelerator.gather_for_metrics(scores_margin.mean()).mean().item()
+            )
+            self.stats["objective/scores"].append(self.accelerator.gather_for_metrics(rewards.mean()).mean().item())
+            self.stats["val/contain_eos_token"].append(contain_eos_token.float().mean().item())
+
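+        # (Note: `gather_for_metrics` is assumed to collect each scalar from every
+        # process before the final `.mean().item()`, so the logged statistics
+        # reflect the global batch rather than a single rank.)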
self.stats["logps/chosen"].append(self.accelerator.gather_for_metrics(chosen_logprobs_sum).mean().item()) + self.stats["logps/rejected"].append(self.accelerator.gather_for_metrics(rejected_logprobs_sum).mean().item()) + + kl = logprobs - ref_logprobs + mean_kl = kl.sum(1).mean() + self.stats["objective/kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item()) + non_score_reward = (-self.beta * kl).sum(1) + mean_non_score_reward = non_score_reward.mean() + self.stats["objective/non_score_reward"].append( + self.accelerator.gather_for_metrics(mean_non_score_reward).mean().item() + ) + if self.reward_funcs is not None: + # Calculate RLHF reward by combining rewards with non_score_reward + rlhf_reward = rewards + non_score_reward + self.stats["objective/rlhf_reward"].append(self.accelerator.gather_for_metrics(rlhf_reward).mean().item()) + + mean_entropy = -logprobs.sum(1).mean() + self.stats["objective/entropy"].append(self.accelerator.gather_for_metrics(mean_entropy).mean().item()) + chosen_rewards = self.beta * (chosen_logprobs_sum - chosen_ref_logprobs_sum) + gathered_chosen_rewards = self.accelerator.gather_for_metrics(chosen_rewards) + self.stats["rewards/chosen"].append(gathered_chosen_rewards.mean().item()) + rejected_rewards = self.beta * (rejected_logprobs_sum - rejected_ref_logprobs_sum) + gathered_rejected_rewards = self.accelerator.gather_for_metrics(rejected_rewards) + self.stats["rewards/rejected"].append(gathered_rejected_rewards.mean().item()) + margin = gathered_chosen_rewards - gathered_rejected_rewards + self.stats["rewards/margins"].append(margin.mean().item()) + accuracy = margin > 0 + self.stats["rewards/accuracies"].append(accuracy.float().mean().item()) + self.stats["beta"].append(self.beta) + + if ( + self.args.torch_empty_cache_steps is not None + and self.state.global_step % self.args.torch_empty_cache_steps == 0 + ): + empty_cache() + + kwargs = {} + + # For LOMO optimizers you need to explicitly use the learning rate + if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: + kwargs["learning_rate"] = self._get_learning_rate() + + if self.args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + + if self.use_apex: + with amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + self.accelerator.backward(loss, **kwargs) + + return loss.detach() / self.args.gradient_accumulation_steps + + # Same as Trainer._maybe_log_save_evaluate but log our metrics + def _maybe_log_save_evaluate( + self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time, learning_rate=None + ): + if self.control.should_log and self.state.global_step > self._globalstep_last_logged: + logs: dict[str, float] = {} + + # all_gather + mean() to get average loss over all processes + tr_loss_scalar = self._nested_gather(tr_loss).mean().item() + + # reset tr_loss to zero + tr_loss -= tr_loss + + logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) + if grad_norm is not None: + logs["grad_norm"] = grad_norm.detach().item() if isinstance(grad_norm, torch.Tensor) else grad_norm + if learning_rate is not None: + logs["learning_rate"] = learning_rate + else: + logs["learning_rate"] = self._get_learning_rate() + + # Add our metrics + for key, val in self.stats.items(): + logs[key] = sum(val) / len(val) + self.stats = {key: [] for key in self.stats} # reset stats + + self._total_loss_scalar += tr_loss_scalar + self._globalstep_last_logged = 
self.state.global_step + self.store_flos() + self.log(logs, start_time) + + metrics = None + if self.control.should_evaluate: + metrics = self._evaluate(trial, ignore_keys_for_eval) + is_new_best_metric = self._determine_best_metric(metrics=metrics, trial=trial) + + if self.args.save_strategy == "best": + self.control.should_save = is_new_best_metric + + if self.control.should_save: + self._save_checkpoint(model, trial) + self.control = self.callback_handler.on_save(self.args, self.state, self.control) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + # docstyle-ignore + citation = textwrap.dedent("""\ + @article{guo2024direct, + title = {{Direct Language Model Alignment from Online AI Feedback}}, + author = {Shangmin Guo and Biao Zhang and Tianlin Liu and Tianqi Liu and Misha Khalman and Felipe Llinares and Alexandre Ram{\'{e}} and Thomas Mesnard and Yao Zhao and Bilal Piot and Johan Ferret and Mathieu Blondel}, + year = 2024, + eprint = {arXiv:2402.04792} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="Online DPO", + trainer_citation=citation, + paper_title="Direct Language Model Alignment from Online AI Feedback", + paper_id="2402.04792", + ) + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothOnlineDPOTrainer(_UnslothOnlineDPOTrainer): + """ + +Initialize OnlineDPOTrainer. + +Args: + model (`Union[str, nn.Module, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in + `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. 
Only causal language models are supported.
+    ref_model (`transformers.PreTrainedModel` or `torch.nn.Module` or `None`):
+        The reference model to use for training. If None is specified, the reference model will be created from
+        the model.
+    judge (`BasePairwiseJudge`):
+        The judge to use for pairwise comparison of model completions.
+    reward_funcs (`Union[RewardFunc, list[RewardFunc]]`, *optional*, defaults to `None`):
+        Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
+        functions with the prompts and completions and sum the rewards. Can be either:
+
+        - A single reward function: Can be a string (path to a model), a [`~transformers.PreTrainedModel`], or a
+          custom callable function.
+        - A list of reward functions: Must all be of compatible types.
+
+        Note: Only one of `judge` or `reward_funcs` should be provided.
+    args (`OnlineDPOConfig`):
+        The online DPO config arguments to use for training.
+    data_collator (`transformers.DataCollator`):
+        The data collator to use for training. If None is specified, the default data collator
+        (`DPODataCollatorWithPadding`) will be used, which will pad the sequences to the maximum length of the
+        sequences in the batch, given a dataset of paired sequences.
+    train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+        The dataset to use for training.
+    eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
+        The dataset to use for evaluation.
+    processing_class ([`~transformers.PreTrainedTokenizerBase`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
+        Processing class used to process the data. If provided, will be used to automatically process the inputs
+        for the model, and it will be saved along with the model to make it easier to rerun an interrupted
+        training or reuse the fine-tuned model.
+    reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
+        Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
+
+        - A single processing class: Used when `reward_funcs` contains only one reward function.
+        - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
+
+        If set to `None`, the tokenizer for each model-based reward function is automatically loaded using
+        [`~transformers.AutoTokenizer.from_pretrained`].
+    peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
+        PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
+    compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+        The function to use to compute the metrics. Must take an `EvalPrediction` and return a dictionary mapping
+        metric names to metric values.
+    callbacks (`list[transformers.TrainerCallback]`):
+        The callbacks to use for training.
+    optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+        The optimizer and scheduler to use for training.
+    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+        The function to use to preprocess the logits before computing the metrics.
+
+.. deprecated:: 0.22.0
+    The following parameters are deprecated and will be removed in a future version:
+
+    * `reward_model`: Use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`.
+    * `reward_processing_class`: Use `reward_processing_classes` instead.
For example, change + `reward_processing_class=tokenizer` to `reward_processing_classes=tokenizer`. + + """ + def __init__( + self, + model, + ref_model = None, + reward_funcs = None, + judge = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + reward_processing_classes = None, + peft_config = None, + compute_metrics = None, + callbacks = None, + preprocess_logits_for_metrics = None, + reward_model = None, + reward_processing_class = None, + **kwargs + ): + if args is None: args = UnslothOnlineDPOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: 
args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('online_dpo_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? 
[TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + model = model, + ref_model = ref_model, + reward_funcs = reward_funcs, + judge = judge, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + reward_processing_classes = reward_processing_classes, + peft_config = peft_config, + compute_metrics = compute_metrics, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + reward_model = reward_model, + reward_processing_class = reward_processing_class,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass + + +if hasattr(logger, "addFilter"): + import logging + class HideLoggingMessage(logging.Filter): + def __init__(self, text): self.text = text + def filter(self, x): return not (self.text in x.getMessage()) + pass + logger.addFilter(HideLoggingMessage("`use_cache=True`")) + diff --git a/unsloth_compiled_cache/UnslothPPOTrainer.py b/unsloth_compiled_cache/UnslothPPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..4daa950e44cb32d079f157a8d4ef12eb66250a3d --- /dev/null +++ b/unsloth_compiled_cache/UnslothPPOTrainer.py @@ -0,0 +1,1566 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . 
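+# Note: `chunked_selective_log_softmax` below is intended to match trl's
+# `selective_log_softmax(logits, index)` exactly (see the comment inside its
+# loop), but computes the log-softmax over four float32 chunks so the temporary
+# logsumexp buffers stay at roughly a quarter of the full (batch * seq) size.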
+ +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.ppo_trainer import (Accelerator, BaseImageProcessor, CallbackHandler, DEFAULT_CALLBACKS, DEFAULT_PROGRESS_CALLBACK, DataCollatorWithPadding, DataLoader, Dataset, ExportableState, FeatureExtractionMixin, GenerationConfig, INVALID_LOGPROB, OnlineTrainerState, Optional, PPOConfig, PPOTrainer, Path, PeftConfig, PeftModel, PolicyAndValueWrapper, PreTrainedTokenizerBase, PrinterCallback, ProcessorMixin, Trainer, TrainerCallback, TrainerControl, Union, batch_generation, broadcast, contextmanager, create_reference_model, defaultdict, disable_dropout_in_model, empty_cache, exact_div, first_true_indices, forward, gather_object, gc, generate_model_card, get_comet_experiment_url, get_peft_model, get_reporting_integration_callbacks, get_reward, is_peft_available, is_rich_available, is_wandb_available, log_table_to_comet_experiment, masked_mean, masked_whiten, math, nn, np, nullcontext, os, pd, peft_module_casting_to_bf16, prepare_deepspeed, print_rich_table, selective_log_softmax, textwrap, time, torch, truncate_response, unwrap_model_for_generation, Optional, PeftModel, Trainer, is_peft_available, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) + logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values + all_per_token_logps.append(per_token_logps) + pass + all_per_token_logps = torch.concat(all_per_token_logps) + all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) + return all_per_token_logps + +def calculate_pad_tokens_in_prompt( + input_ids: torch.Tensor, + logits_to_keep: int, + pad_token_id: int +) -> torch.Tensor: + """ 
+    Given a prompt tensor, returns the number of left-padding tokens in each sequence, so [pad, pad, pad, cat] -> 3.
+    """
+    if logits_to_keep >= input_ids.shape[1]:
+        raise ValueError("logits_to_keep must be smaller than the sequence length.")
+
+    prompt_section = input_ids[:, :-logits_to_keep]
+
+    padding_mask = (prompt_section == pad_token_id)
+
+    pad_token_counts = padding_mask.sum(dim=1)
+
+    return pad_token_counts
+
+def create_completion_attention_mask(
+    completion_input_ids: torch.Tensor,
+    left_pad_tokens_per_prompt: torch.Tensor,
+    max_left_pad: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a sequence [p,p,p,c,c,c,pad,pad,pad],
+
+    where p are extra prompt tokens left over from slicing the tensor, c are completion tokens
+    and pad are padding tokens, this function builds a completion mask that zeroes out the pad
+    and p tokens, so in this example [0,0,0,1,1,1,0,0,0].
+    """
+    batch_size, completion_len = completion_input_ids.shape
+    device = completion_input_ids.device
+
+    num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
+
+    indices = torch.arange(completion_len, device=device).unsqueeze(0)
+    shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
+
+    non_padding_mask = (completion_input_ids != pad_token_id)
+
+    final_mask = shift_mask & non_padding_mask
+
+    return final_mask
+
+def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
+    """
+    Packs the non-padding tokens of each sequence to the left, moving all padding tokens to the right.
+    """
+    mask = (tensor != pad_id)
+    # Must use stable=True since argsort on a binary mask is full of ties and token order must be kept
+    sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
+    packed_tensor = torch.gather(tensor, 1, sorted_indices)
+    return packed_tensor
+
+def align_logprobs_with_mask(
+    logprob_tensor: torch.Tensor,
+    attention_mask: torch.Tensor,
+    pad_value: float = 0.0
+) -> torch.Tensor:
+    """
+    Aligns a log probability tensor with a given attention mask.
+    """
+
+    device = logprob_tensor.device
+    batch_size, logprob_seq_len = logprob_tensor.shape
+    mask_seq_len = attention_mask.shape[1]
+
+    padded_logprobs = torch.full(
+        attention_mask.shape,
+        fill_value=pad_value,
+        dtype=logprob_tensor.dtype,
+        device=device
+    )
+
+    left_pad_counts = torch.argmax(attention_mask, dim=1)
+
+    cols = torch.arange(logprob_seq_len, device=device)
+    dest_indices = left_pad_counts.unsqueeze(1) + cols
+
+    # Create destination row indices
+    # Shape: [batch_size, logprob_seq_len]
+    row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
+
+    # Filter out-of-bounds indices and perform the assignment.
+    # Create a mask to identify only the indices that are within the bounds
+    # of the target tensor's sequence length.
+    valid_mask = dest_indices < mask_seq_len
+
+    # Use this mask to select only the valid row indices, column indices,
+    # and the corresponding values from the logprob tensor.
+    # This flattens the selected elements into 1D tensors.
+    valid_rows = row_indices[valid_mask]
+    valid_cols = dest_indices[valid_mask]
+    valid_vals = logprob_tensor[valid_mask]
+
+    # Place the valid values into their correct positions in the padded tensor
+    # using a single, efficient advanced indexing operation.
+    padded_logprobs[valid_rows, valid_cols] = valid_vals
+
+    return padded_logprobs
+@dataclass
+class UnslothPPOConfig(PPOConfig):
+    """
+
+Configuration class for the [`PPOTrainer`].
+
+This class includes only the parameters that are specific to PPO training.
For a full list of training arguments, +please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default +values in this class may differ from those in [`~transformers.TrainingArguments`]. + +Using [`~transformers.HfArgumentParser`] we can turn this class into +[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the +command line. + +Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[:-3]`): + Name of this experiment. + reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`): + Path to the reward model. + model_adapter_name (`str` or `None`, *optional*, defaults to `None`): + Name of the train target PEFT adapter, when using LoRA with multiple adapters. + ref_adapter_name (`str` or `None`, *optional*, defaults to `None`): + Name of the reference PEFT adapter, when using LoRA with multiple adapters. + num_ppo_epochs (`int`, *optional*, defaults to `4`): + Number of epochs to train. + whiten_rewards (`bool`, *optional*, defaults to `False`): + Whether to whiten the rewards. + kl_coef (`float`, *optional*, defaults to `0.05`): + KL coefficient. + kl_estimator (`Literal["k1", "k3"]`, *optional*, defaults to `"k1"`): + Which estimator for KL-Divergence to use from [Approximating KL + Divergence](http://joschu.net/blog/kl-approx.html). Defaults to "k1", a straightforward, unbiased + estimator. Can be set to "k3", an unbiased estimator with lower variance which "appears to be a strictly + better estimator". Cannot be set to "k2", as it is used for logging purposes. + cliprange (`float`, *optional*, defaults to `0.2`): + Clip range. + vf_coef (`float`, *optional*, defaults to `0.1`): + Value function coefficient. + cliprange_value (`float`, *optional*, defaults to `0.2`): + Clip range for the value function. + gamma (`float`, *optional*, defaults to `1.0`): + Discount factor. + lam (`float`, *optional*, defaults to `0.95`): + Lambda value for GAE. + ds3_gather_for_generation (`bool`, *optional*, defaults to `True`): + This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, + improving generation speed. However, disabling this option allows training models that exceed the VRAM + capacity of a single GPU, albeit at the cost of slower generation. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = None, + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + parallelism_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + project = 'huggingface', + trackio_space_id = 'trackio', + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = True, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object 
= False,
+        average_tokens_across_devices = True,
+        dataset_num_proc = None,
+        num_mini_batches = 1,
+        total_episodes = None,
+        local_rollout_forward_batch_size = 64,
+        num_sample_generations = 10,
+        response_length = 53,
+        stop_token = None,
+        stop_token_id = None,
+        temperature = 0.7,
+        missing_eos_penalty = None,
+        sft_model_path = 'EleutherAI/pythia-160m',
+        world_size = None,
+        num_total_batches = None,
+        micro_batch_size = None,
+        local_batch_size = None,
+        batch_size = None,
+        local_mini_batch_size = None,
+        mini_batch_size = None,
+        exp_name = 'ppo_config',
+        reward_model_path = 'EleutherAI/pythia-160m',
+        model_adapter_name = None,
+        ref_adapter_name = None,
+        num_ppo_epochs = 4,
+        whiten_rewards = False,
+        kl_coef = 0.05,
+        kl_estimator = 'k1',
+        cliprange = 0.2,
+        vf_coef = 0.1,
+        cliprange_value = 0.2,
+        gamma = 1.0,
+        lam = 0.95,
+        ds3_gather_for_generation = True,
+        vllm_sampling_params = None,
+        unsloth_num_chunks = -1,
+
+        **kwargs,
+    ):
+        if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!')
+        if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too large (> 1)! Consider decreasing it to 1e-1, otherwise gradient updates will explode!')
+        if output_dir is None and save_strategy == 'steps' and save_steps == 500:
+            output_dir = 'unsloth_training_checkpoints'
+            save_strategy = 'no'
+        if dataset_num_proc is None:
+            from multiprocessing import cpu_count
+            dataset_num_proc = min(max(cpu_count()+4, 2), 64)
+        if temperature <= 0:
+            raise ValueError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.')
+        elif temperature >= 10:
+            raise ValueError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.')
+
+
+        super().__init__(
+            output_dir = output_dir,
+            overwrite_output_dir = overwrite_output_dir,
+            do_train = do_train,
+            do_eval = do_eval,
+            do_predict = do_predict,
+            eval_strategy = eval_strategy,
+            prediction_loss_only = prediction_loss_only,
+            per_device_train_batch_size = per_device_train_batch_size,
+            per_device_eval_batch_size = per_device_eval_batch_size,
+            per_gpu_train_batch_size = per_gpu_train_batch_size,
+            per_gpu_eval_batch_size = per_gpu_eval_batch_size,
+            gradient_accumulation_steps = gradient_accumulation_steps,
+            eval_accumulation_steps = eval_accumulation_steps,
+            eval_delay = eval_delay,
+            torch_empty_cache_steps = torch_empty_cache_steps,
+            learning_rate = learning_rate,
+            weight_decay = weight_decay,
+            adam_beta1 = adam_beta1,
+            adam_beta2 = adam_beta2,
+            adam_epsilon = adam_epsilon,
+            max_grad_norm = max_grad_norm,
+            num_train_epochs = num_train_epochs,
+            max_steps = max_steps,
+            lr_scheduler_type = lr_scheduler_type,
+            warmup_ratio = warmup_ratio,
+            warmup_steps = warmup_steps,
+            log_level = log_level,
+            log_level_replica = log_level_replica,
+            log_on_each_node = log_on_each_node,
+            logging_dir = logging_dir,
+            logging_strategy = logging_strategy,
+            logging_first_step = logging_first_step,
+            logging_steps = logging_steps,
+            logging_nan_inf_filter = logging_nan_inf_filter,
+            save_strategy = save_strategy,
+            save_steps = save_steps,
+            save_total_limit = save_total_limit,
+            save_safetensors = save_safetensors,
+            save_on_each_node = save_on_each_node,
+            save_only_model = save_only_model,
+            restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint,
+            no_cuda = no_cuda,
+            use_cpu = use_cpu,
+
use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + dataset_num_proc = dataset_num_proc, + num_mini_batches = num_mini_batches, + total_episodes = total_episodes, + local_rollout_forward_batch_size = 
local_rollout_forward_batch_size, + num_sample_generations = num_sample_generations, + response_length = response_length, + stop_token = stop_token, + stop_token_id = stop_token_id, + temperature = temperature, + missing_eos_penalty = missing_eos_penalty, + sft_model_path = sft_model_path, + world_size = world_size, + num_total_batches = num_total_batches, + micro_batch_size = micro_batch_size, + local_batch_size = local_batch_size, + batch_size = batch_size, + local_mini_batch_size = local_mini_batch_size, + mini_batch_size = mini_batch_size, + exp_name = exp_name, + reward_model_path = reward_model_path, + model_adapter_name = model_adapter_name, + ref_adapter_name = ref_adapter_name, + num_ppo_epochs = num_ppo_epochs, + whiten_rewards = whiten_rewards, + kl_coef = kl_coef, + kl_estimator = kl_estimator, + cliprange = cliprange, + vf_coef = vf_coef, + cliprange_value = cliprange_value, + gamma = gamma, + lam = lam, + ds3_gather_for_generation = ds3_gather_for_generation,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks + +pass + +class _UnslothPPOTrainer(Trainer): + """Trainer for Proximal Policy Optimization (PPO). + + For details on PPO, see the paper: [Proximal Policy Optimization + Algorithms](https://huggingface.co/papers/1707.06347). + + Args: + args ([`PPOConfig`]): + Training arguments. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`]): + Class to process the data. + model (`torch.nn.Module`): + Model to be trained. This is the policy model. + ref_model (`torch.nn.Module`, *optional*): + Reference model used to compute the KL divergence. If `None`, a copy of the policy model is created. + reward_model (`torch.nn.Module`): + Reward model used to compute the rewards. + train_dataset ([`~datasets.Dataset`]): + Dataset for training. + value_model (`torch.nn.Module`): + Value model used to predict the value of a state. + data_collator ([`~transformers.DataCollatorWithPadding`], *optional*): + Data collator to batch and pad samples from the dataset. If `None`, a default data collator is created + using the `processing_class`. + eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*): + Dataset for evaluation. + optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`): + Tuple containing the optimizer and the learning rate scheduler to use for training. If `None`, the + optimizer and the learning rate scheduler are created using the + [`~transformers.Trainer.create_optimizer_and_scheduler`] method. + callbacks (`list` of [`~transformers.TrainerCallback`], *optional*): + Callbacks to use during training. + peft_config ([`~peft.config.PeftConfig`], *optional*): + PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the policy `model` + will be wrapped with the specified PEFT adapter. 
+ """ + + _tag_names = ["trl", "ppo"] + + def __init__( + self, + args: PPOConfig, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ], + model: nn.Module, + ref_model: Optional[nn.Module], + reward_model: nn.Module, + train_dataset: Dataset, + value_model: nn.Module, + data_collator: Optional[DataCollatorWithPadding] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + # less commonly used + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + callbacks: Optional[list[TrainerCallback]] = None, + peft_config: Optional["PeftConfig"] = None, + ) -> None: + if ref_model is model: + raise ValueError( + "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the " + "same as `model`, you must make a copy of it, or `None` if you use peft." + ) + + self.args = args + self.processing_class = processing_class + self.policy_model = model + + # Define the collator if not provided + if data_collator is None: + data_collator = DataCollatorWithPadding(self.processing_class) + + # Handle stop token settings: update policy model's generation_config to use provided stop token + if args.stop_token and args.stop_token_id: + raise ValueError("You cannot set both `stop_token` and `stop_token_id`.") + elif args.stop_token: + if args.stop_token == "eos": + self.policy_model.generation_config.eos_token_id = self.stop_token_id = processing_class.eos_token_id + else: + raise ValueError( + f"Unknown `stop_token` {args.stop_token}. Allowed values are: `'eos'` and `None` (no stop token)." + ) + else: + self.policy_model.generation_config.eos_token_id = self.stop_token_id = args.stop_token_id # None or int + + # Check that the kl estimator is valid + if self.args.kl_estimator not in {"k1", "k3"}: + raise ValueError( + "kl_estimator must be either 'k1' (straightforward, unbiased) or 'k3' (lower variance, unbiased, " + "appears to be a strictly better estimator). See " + "[Approximating KL Divergence](http://joschu.net/blog/kl-approx.html) for details." 
+        )
+
+        # peft support
+        if not is_peft_available() and peft_config is not None:
+            raise ImportError(
+                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
+            )
+        elif is_peft_available() and peft_config is not None:
+            # if model is a peft model and we have a peft_config, we merge and unload it first
+            if isinstance(self.policy_model, PeftModel):
+                self.policy_model = self.policy_model.merge_and_unload()
+
+            # get peft model with the given config
+            self.policy_model = get_peft_model(self.policy_model, peft_config)
+            if args.bf16 and getattr(self.policy_model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(self.policy_model)
+
+        self.is_peft_model = is_peft_available() and isinstance(self.policy_model, PeftModel)
+        self.model_adapter_name = args.model_adapter_name
+        self.ref_adapter_name = args.ref_adapter_name
+
+        if ref_model:
+            self.ref_model = ref_model
+        elif self.is_peft_model:
+            self.ref_model = None
+        else:
+            self.ref_model = create_reference_model(self.policy_model)
+
+        self.reward_model = reward_model
+        self.train_dataset = train_dataset
+        self.train_dataset_len = len(train_dataset)
+        self.value_model = value_model
+        self.data_collator = data_collator
+        self.eval_dataset = eval_dataset
+        self.optimizer, self.lr_scheduler = optimizers
+        self.optimizer_cls_and_kwargs = None # needed for transformers >= 4.47
+
+        #########
+        # calculate various batch sizes
+        #########
+        if args.total_episodes is None: # allow the users to define episodes in terms of epochs.
+            args.total_episodes = int(args.num_train_epochs * self.train_dataset_len)
+        accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)
+        self.accelerator = accelerator
+        args.world_size = accelerator.num_processes
+        args.local_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps
+        args.micro_batch_size = int(args.per_device_train_batch_size * args.world_size)
+        args.batch_size = int(args.local_batch_size * args.world_size)
+        args.mini_batch_size = exact_div(
+            args.batch_size, args.num_mini_batches, "`batch_size` must be a multiple of `num_mini_batches`"
+        )
+        args.local_mini_batch_size = exact_div(
+            args.local_batch_size, args.num_mini_batches, "`local_batch_size` must be a multiple of `num_mini_batches`"
+        )
+        if args.whiten_rewards:
+            assert args.local_mini_batch_size >= 8, (
+                f"Per-rank minibatch size {args.local_mini_batch_size} is insufficient for whitening"
+            )
+        # `per_rank_rollout_batch_size` is our `args.local_batch_size`
+        # `per_rank_minibatch_size` is our `args.local_mini_batch_size`
+        args.num_total_batches = math.ceil(
+            args.total_episodes / args.batch_size
+        ) # we may train for more than `total_episodes`
+        time_tensor = torch.tensor(int(time.time()), device=accelerator.device)
+        time_int = broadcast(time_tensor, 0).item() # avoid different timestamps across processes
+        args.run_name = f"{args.exp_name}__{args.seed}__{time_int}"
+        self.local_seed = args.seed + accelerator.process_index * 100003 # Prime
+        if args.num_sample_generations > 0:
+            self.sample_generations_freq = max(1, args.num_total_batches // args.num_sample_generations)
+        self.local_dataloader_batch_size = args.local_batch_size
+
+        #########
+        # setup model, optimizer, and others
+        #########
+        for module in [self.policy_model, self.ref_model, self.value_model, self.reward_model]:
+            if module is not None:
+                disable_dropout_in_model(module)
+        self.model = PolicyAndValueWrapper(self.policy_model, self.value_model)
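+        # Editor's note: a worked example of the batch-size arithmetic above, using
+        # illustrative numbers (per_device_train_batch_size=4,
+        # gradient_accumulation_steps=8, world_size=2, num_mini_batches=2):
+        #   local_batch_size      = 4 * 8  = 32  (samples per rank per rollout)
+        #   micro_batch_size      = 4 * 2  = 8   (samples per optimizer step, all ranks)
+        #   batch_size            = 32 * 2 = 64  (samples per rollout, all ranks)
+        #   mini_batch_size       = 64 / 2 = 32, local_mini_batch_size = 32 / 2 = 16
+        # exact_div raises unless each division is exact.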
+        self.model.config = self.policy_model.config # needed for pushing to hub
+        self.create_optimizer_and_scheduler(
+            num_training_steps=args.num_total_batches
+        ) # note that we are calling `self.lr_scheduler.step()` manually only at the batch level
+
+        #########
+        ### trainer specifics
+        #########
+        default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
+        self.callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
+        self.callback_handler = CallbackHandler(
+            self.callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler
+        )
+        self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)
+        self.control = TrainerControl()
+        self.state = OnlineTrainerState(
+            is_local_process_zero=self.is_local_process_zero(),
+            is_world_process_zero=self.is_world_process_zero(),
+            stateful_callbacks=[
+                cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
+            ],
+        )
+        self.current_flos = 0
+        self.hp_search_backend = None
+        self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None
+        self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None
+        # Create distant repo and output directory if needed
+        self.hub_model_id = None
+        if self.args.push_to_hub:
+            self.init_hf_repo()
+        if self.args.should_save:
+            os.makedirs(self.args.output_dir, exist_ok=True)
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        #########
+        ### setup dataloader
+        #########
+        self.dataloader = DataLoader(
+            self.train_dataset,
+            batch_size=self.local_dataloader_batch_size,
+            shuffle=True,
+            collate_fn=self.data_collator,
+            drop_last=True, # needed; otherwise the last batch will be of ragged shape
+        )
+        # sync random states for DataLoader(shuffle=True) before `accelerator.prepare`
+        # see https://gist.github.com/vwxyzjn/2581bff1e48e185e0b85b6dfe1def79c
+        torch.manual_seed(args.seed)
+        self.model, self.optimizer, self.dataloader = accelerator.prepare(self.model, self.optimizer, self.dataloader)
+        torch.manual_seed(self.local_seed) # reset the local seed again
+
+        self.eval_dataloader = DataLoader(
+            self.eval_dataset,
+            batch_size=args.per_device_eval_batch_size,
+            collate_fn=self.data_collator,
+            drop_last=True,
+        ) # no need to shuffle eval dataset
+        self.eval_dataloader = accelerator.prepare(self.eval_dataloader)
+
+        if self.is_deepspeed_enabled:
+            self.reward_model = prepare_deepspeed(
+                self.reward_model, args.per_device_train_batch_size, args.fp16, args.bf16
+            )
+
+            if self.ref_model is None:
+                if not self.is_peft_model:
+                    raise ValueError("No reference model and model is not a Peft model.")
+            else:
+                self.ref_model = prepare_deepspeed(
+                    self.ref_model, args.per_device_train_batch_size, args.fp16, args.bf16
+                )
+        else:
+            if self.ref_model is None:
+                if not self.is_peft_model:
+                    raise ValueError("No reference model and model is not a Peft model.")
+            else:
+                self.ref_model = self.ref_model.to(self.accelerator.device)
+            self.reward_model = self.reward_model.to(self.accelerator.device)
+
+    def get_train_dataloader(self) -> DataLoader:
+        return self.dataloader
+
+    def get_eval_dataloader(self) -> DataLoader:
+        return self.eval_dataloader
+
+    @contextmanager
+    def null_ref_context(self):
+        """Context manager for handling null reference model (that is, peft adapter manipulation)."""
+
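+        # Editor's note: when the policy is a PEFT model and ref_adapter_name is not
+        # set, reference logprobs come from the frozen base weights, obtained by
+        # temporarily disabling the adapter below; if ref_adapter_name is set, that
+        # adapter is activated instead and model_adapter_name (or "default") is
+        # restored on exit.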
with ( + self.accelerator.unwrap_model(self.model.policy).disable_adapter() + if self.is_peft_model and not self.ref_adapter_name + else nullcontext() + ): + if self.ref_adapter_name: + self.model.policy.set_adapter(self.ref_adapter_name) + yield + if self.ref_adapter_name: + self.model.policy.set_adapter(self.model_adapter_name or "default") + + def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False): + backup_model = self.model + self.model = self.model.policy # save only the policy + + if self.is_deepspeed_enabled: + backup_deepspeed = self.deepspeed + self.deepspeed = self.model + + super().save_model(output_dir, _internal_call) + + self.model = backup_model + + if self.is_deepspeed_enabled: + self.deepspeed = backup_deepspeed + + def train(self): + args = self.args + accelerator = self.accelerator + optimizer = self.optimizer + model = self.model + ref_policy = self.ref_model + reward_model = self.reward_model + processing_class = self.processing_class + dataloader = self.dataloader + device = accelerator.device + + def repeat_generator(): + while True: + yield from dataloader + + iter_dataloader = iter(repeat_generator()) + generation_config = GenerationConfig( + max_new_tokens=args.response_length, + temperature=(args.temperature + 1e-7), + top_k=0.0, + top_p=1.0, + do_sample=True, + ) + + accelerator.print("===training policy===") + start_time = time.time() + stats_shape = (args.num_ppo_epochs, args.num_mini_batches, args.gradient_accumulation_steps) + approxkl_stats = torch.zeros(stats_shape, device=device) + pg_clipfrac_stats = torch.zeros(stats_shape, device=device) + pg_loss_stats = torch.zeros(stats_shape, device=device) + vf_loss_stats = torch.zeros(stats_shape, device=device) + vf_clipfrac_stats = torch.zeros(stats_shape, device=device) + entropy_stats = torch.zeros(stats_shape, device=device) + ratio_stats = torch.zeros(stats_shape, device=device) + model.train() + + # trainer state initialization + self.state.global_step = 0 + self.state.episode = 0 + self.state.max_steps = args.num_total_batches + self.state.num_train_epochs = args.total_episodes / self.train_dataset_len + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps < 1: + self.state.logging_steps = math.ceil(self.state.max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(self.state.max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(self.state.max_steps * args.save_steps) + else: + self.state.save_steps = args.save_steps + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) + + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model + self.model_wrapped = self.model + + for update in range(1, args.num_total_batches + 1): + self.state.episode += 1 * args.batch_size + data = next(iter_dataloader) + with torch.no_grad(): + queries = data["input_ids"].to(device) + context_length = queries.shape[1] + responses = [] + postprocessed_responses = [] + logprobs = [] + ref_logprobs = [] + scores = [] + sequence_lengths = [] + values = [] + with unwrap_model_for_generation( + self.model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model: + 
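+                    # Editor's note: rollout phase. batch_generation samples completions
+                    # in chunks of local_rollout_forward_batch_size and also returns the
+                    # logits, from which per-token logprobs are recovered below via
+                    # selective_log_softmax.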
query_responses, logitss = batch_generation( + unwrapped_model.policy, + queries, + args.local_rollout_forward_batch_size, + processing_class.pad_token_id, + generation_config, + ) + + for i in range(0, queries.shape[0], args.local_rollout_forward_batch_size): + query = queries[i : i + args.local_rollout_forward_batch_size] + query_response = query_responses[i : i + args.local_rollout_forward_batch_size] + response = query_response[:, context_length:] + logits = logitss[i : i + args.local_rollout_forward_batch_size] + logprob = selective_log_softmax(logits, response) + del logits + empty_cache() + + if ref_policy is None: + with self.null_ref_context(): + ref_output = forward(model.policy, query_response, processing_class.pad_token_id) + else: + ref_output = forward(ref_policy, query_response, processing_class.pad_token_id) + ref_logits = ref_output.logits[:, context_length - 1 : -1] + ref_logits /= args.temperature + 1e-7 + ref_logprob = selective_log_softmax(ref_logits, response) + del ref_output, ref_logits + empty_cache() + + # Response Processing 1. truncate response after the first occurrence of `stop_token_id` + postprocessed_response = response + if self.stop_token_id is not None: # handle the edge case when stop_token_id exists but is 0 + postprocessed_response = truncate_response( + self.stop_token_id, processing_class.pad_token_id, response + ) + + # Response Processing 2. run reward model on the truncated responses + postprocessed_query_response = torch.cat((query, postprocessed_response), 1) + sequence_length = first_true_indices(postprocessed_response == processing_class.pad_token_id) - 1 + unwrapped_value_model = accelerator.unwrap_model(model).value_model + full_value, _, _ = get_reward( + unwrapped_value_model, query_response, processing_class.pad_token_id, context_length + ) + value = full_value[:, context_length - 1 : -1].squeeze(-1) + _, score, _ = get_reward( + reward_model, postprocessed_query_response, processing_class.pad_token_id, context_length + ) + + responses.append(response) + postprocessed_responses.append(postprocessed_response) + logprobs.append(logprob) + ref_logprobs.append(ref_logprob) + sequence_lengths.append(sequence_length) + scores.append(score) + values.append(value) + responses = torch.cat(responses, 0) + postprocessed_responses = torch.cat(postprocessed_responses, 0) + logprobs = torch.cat(logprobs, 0) + ref_logprobs = torch.cat(ref_logprobs, 0) + sequence_lengths = torch.cat(sequence_lengths, 0) + scores = torch.cat(scores, 0) + values = torch.cat(values, 0) + del (logprob, ref_logprob, full_value, value, score, unwrapped_model) + empty_cache() + gc.collect() + + # Response Processing 3. Filter completion. Ensure that the sample contains stop_token_id + # Completions not passing that filter will receive a lower score. 
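+                # Editor's note: below, padding_mask hides every position after the last
+                # real response token (those logprobs become INVALID_LOGPROB), while
+                # padding_mask_p1 keeps one extra position: the scalar reward is added at
+                # sequence_lengths + 1, and the value prediction there feeds the GAE
+                # recursion.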
+ contain_eos_token = torch.any(postprocessed_responses == self.processing_class.eos_token_id, dim=-1) + if self.args.missing_eos_penalty is not None: + scores[~contain_eos_token] -= self.args.missing_eos_penalty + # accelerator.print(f"{scores=}, {(contain_eos_token.sum() / len(contain_eos_token))=}") + + # be very careful with `padding_mask_p1`; see https://excalidraw.com/#json=LWnzG4w2k5DjF_EOL_xPt,e2w3a-hFJ_gX5vOfeyXGTw + response_idxs = torch.arange(responses.shape[1], device=responses.device).repeat(responses.shape[0], 1) + padding_mask = response_idxs > sequence_lengths.unsqueeze(1) + logprobs = torch.masked_fill(logprobs, padding_mask, INVALID_LOGPROB) + ref_logprobs = torch.masked_fill(ref_logprobs, padding_mask, INVALID_LOGPROB) + sequence_lengths_p1 = sequence_lengths + 1 + padding_mask_p1 = response_idxs > (sequence_lengths_p1.unsqueeze(1)) + values = torch.masked_fill(values, padding_mask_p1, 0) + + # 4. compute rewards + # Formula used by http://joschu.net/blog/kl-approx.html for the k1 and k3 estimators + logr = ref_logprobs - logprobs + kl = -logr if args.kl_estimator == "k1" else (logr.exp() - 1) - logr # Else statement is k3 + non_score_reward = -args.kl_coef * kl + rewards = non_score_reward.clone() + actual_start = torch.arange(rewards.size(0), device=rewards.device) + actual_end = torch.where(sequence_lengths_p1 < rewards.size(1), sequence_lengths_p1, sequence_lengths) + rewards[[actual_start, actual_end]] += scores + + # 5. whiten rewards + if args.whiten_rewards: + rewards = masked_whiten(rewards, mask=~padding_mask_p1, shift_mean=False) + rewards = torch.masked_fill(rewards, padding_mask_p1, 0) + + # 6. compute advantages and returns + lastgaelam = 0 + advantages_reversed = [] + gen_length = responses.shape[1] + for t in reversed(range(gen_length)): + nextvalues = values[:, t + 1] if t < gen_length - 1 else 0.0 + delta = rewards[:, t] + args.gamma * nextvalues - values[:, t] + lastgaelam = delta + args.gamma * args.lam * lastgaelam + advantages_reversed.append(lastgaelam) + advantages = torch.stack(advantages_reversed[::-1], axis=1) + returns = advantages + values + advantages = masked_whiten(advantages, ~padding_mask) + advantages = torch.masked_fill(advantages, padding_mask, 0) + empty_cache() + + # Do multiple epochs of PPO training, with a fresh random shuffle in each epoch + for ppo_epoch_idx in range(args.num_ppo_epochs): + b_inds = np.random.permutation(args.local_batch_size) + minibatch_idx = 0 + for mini_batch_start in range(0, args.local_batch_size, args.local_mini_batch_size): + mini_batch_end = mini_batch_start + args.local_mini_batch_size + mini_batch_inds = b_inds[mini_batch_start:mini_batch_end] + gradient_accumulation_idx = 0 + for micro_batch_start in range(0, args.local_mini_batch_size, args.per_device_train_batch_size): + with accelerator.accumulate(model): + micro_batch_end = micro_batch_start + args.per_device_train_batch_size + micro_batch_inds = mini_batch_inds[micro_batch_start:micro_batch_end] + mb_advantage = advantages[micro_batch_inds] + mb_responses = responses[micro_batch_inds] + mb_query_responses = query_responses[micro_batch_inds] + mb_logprobs = logprobs[micro_batch_inds] + mb_return = returns[micro_batch_inds] + mb_values = values[micro_batch_inds] + + output, vpred_temp = forward(model, mb_query_responses, processing_class.pad_token_id) + logits = output.logits[:, context_length - 1 : -1] + logits /= args.temperature + 1e-7 + new_logprobs = selective_log_softmax(logits, mb_responses) + new_logprobs = torch.masked_fill( + 
new_logprobs, padding_mask[micro_batch_inds], INVALID_LOGPROB + ) + vpred = vpred_temp[:, context_length - 1 : -1].squeeze(-1) + vpred = torch.masked_fill(vpred, padding_mask_p1[micro_batch_inds], 0) + vpredclipped = torch.clamp( + vpred, + mb_values - args.cliprange_value, + mb_values + args.cliprange_value, + ) + vf_losses1 = torch.square(vpred - mb_return) + vf_losses2 = torch.square(vpredclipped - mb_return) + vf_loss_max = torch.max(vf_losses1, vf_losses2) + vf_loss = 0.5 * masked_mean(vf_loss_max, ~padding_mask_p1[micro_batch_inds]) + vf_clipfrac = masked_mean( + (vf_losses2 > vf_losses1).float(), ~padding_mask_p1[micro_batch_inds] + ) + logprobs_diff = new_logprobs - mb_logprobs + ratio = torch.exp(logprobs_diff) + pg_losses = -mb_advantage * ratio + pg_losses2 = -mb_advantage * torch.clamp(ratio, 1.0 - args.cliprange, 1.0 + args.cliprange) + pg_loss_max = torch.max(pg_losses, pg_losses2) + pg_loss = masked_mean(pg_loss_max, ~padding_mask[micro_batch_inds]) + loss = pg_loss + args.vf_coef * vf_loss + accelerator.backward(loss) + optimizer.step() + optimizer.zero_grad() + with torch.no_grad(): + pg_clipfrac = masked_mean( + (pg_losses2 > pg_losses).float(), ~padding_mask[micro_batch_inds] + ) + prob_dist = torch.nn.functional.softmax(logits, dim=-1, dtype = torch.float32).to(logits.dtype) + entropy = torch.logsumexp(logits, dim=-1) - torch.sum(prob_dist * logits, dim=-1) + approxkl = 0.5 * (logprobs_diff**2).mean() + approxkl_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = approxkl + pg_clipfrac_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = ( + pg_clipfrac + ) + pg_loss_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = pg_loss + vf_loss_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = vf_loss + vf_clipfrac_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = ( + vf_clipfrac + ) + entropy_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = entropy.mean() + ratio_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = ratio.mean() + gradient_accumulation_idx += 1 + minibatch_idx += 1 + # del everything and empty cache + # fmt: off + del ( + output, vpred_temp, logits, new_logprobs, vpred, vpredclipped, + vf_losses1, vf_losses2, vf_loss, vf_clipfrac, logprobs_diff, ratio, pg_losses, pg_losses2, pg_loss_max, + pg_loss, loss, pg_clipfrac, prob_dist, entropy, approxkl, mb_return, + mb_advantage, mb_values, mb_responses, mb_query_responses, mb_logprobs, + ) + # fmt: on + empty_cache() + with torch.no_grad(): + mean_kl = kl.sum(1).mean() + mean_entropy = (-logprobs).sum(1).mean() + mean_non_score_reward = non_score_reward.sum(1).mean() + rlhf_reward = mean_non_score_reward + scores.mean() + eps = int(self.state.episode / (time.time() - start_time)) + metrics = {} + metrics["eps"] = eps + metrics["objective/kl"] = self.accelerator.gather_for_metrics(mean_kl).mean().item() + metrics["objective/entropy"] = self.accelerator.gather_for_metrics(mean_entropy).mean().item() + metrics["objective/non_score_reward"] = ( + self.accelerator.gather_for_metrics(mean_non_score_reward).mean().item() + ) + metrics["objective/rlhf_reward"] = self.accelerator.gather_for_metrics(rlhf_reward).mean().item() + metrics["objective/scores"] = self.accelerator.gather_for_metrics(scores.mean()).mean().item() + metrics["policy/approxkl_avg"] = self.accelerator.gather_for_metrics(approxkl_stats).mean().item() + metrics["policy/clipfrac_avg"] = self.accelerator.gather_for_metrics(pg_clipfrac_stats).mean().item() + 
metrics["loss/policy_avg"] = self.accelerator.gather_for_metrics(pg_loss_stats).mean().item() + metrics["loss/value_avg"] = self.accelerator.gather_for_metrics(vf_loss_stats).mean().item() + metrics["val/clipfrac_avg"] = self.accelerator.gather_for_metrics(vf_clipfrac_stats).mean().item() + metrics["policy/entropy_avg"] = self.accelerator.gather_for_metrics(entropy_stats).mean().item() + metrics["val/ratio"] = self.accelerator.gather_for_metrics(ratio_stats).mean().item() + metrics["val/ratio_var"] = self.accelerator.gather_for_metrics(ratio_stats).var().item() + metrics["val/num_eos_tokens"] = (responses == processing_class.eos_token_id).sum().item() + metrics["lr"] = self.lr_scheduler.get_last_lr()[0] + metrics["episode"] = self.state.episode + self.state.epoch = self.state.episode / self.train_dataset_len # used by self.log + self.state.global_step += 1 + self.log(metrics) + + self.lr_scheduler.step() + self.control = self.callback_handler.on_step_end(args, self.state, self.control) + if self.control.should_save: + self._save_checkpoint(model, trial=None) + self.control = self.callback_handler.on_save(self.args, self.state, self.control) + del kl, mean_kl, mean_entropy, mean_non_score_reward, scores, metrics, non_score_reward + empty_cache() + gc.collect() + + if args.num_sample_generations > 0 and (update - 1) % self.sample_generations_freq == 0: + self.generate_completions(sampling=True) + empty_cache() + del ( + query_responses, + responses, + postprocessed_responses, + logprobs, + ref_logprobs, + values, + sequence_lengths, + contain_eos_token, + sequence_lengths_p1, + response_idxs, + padding_mask, + padding_mask_p1, + rewards, + actual_start, + actual_end, + advantages, + returns, + ) + empty_cache() + + # HF trainer specifics + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + if self.control.should_save: + self._save_checkpoint(model, trial=None) + self.control = self.callback_handler.on_save(self.args, self.state, self.control) + + def generate_completions(self, sampling: bool = False): + args = self.args + processing_class = self.processing_class + generation_config = GenerationConfig( + max_new_tokens=self.args.response_length, + temperature=(0.01 + 1e-7), + top_k=0.0, + top_p=1.0, + do_sample=True, + ) + + table = defaultdict(list) + with unwrap_model_for_generation( + self.model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model: + for batch in self.eval_dataloader: + query = batch["input_ids"] + with torch.no_grad(): + context_length = query.shape[1] + query_response, _ = batch_generation( + unwrapped_model.policy, + query, + query.shape[0], + processing_class.pad_token_id, + generation_config, + ) + response = query_response[:, context_length:] + postprocessed_response = response + if self.stop_token_id is not None: # handle the edge case when stop_token_id exists but is 0 + postprocessed_response = truncate_response( + self.stop_token_id, processing_class.pad_token_id, response + ) + table["query"].extend( + gather_object(processing_class.batch_decode(query, skip_special_tokens=True)) + ) + table["model response"].extend( + gather_object(processing_class.batch_decode(postprocessed_response)) + ) + + postprocessed_query_response = torch.cat((query, postprocessed_response), 1) + _, score, _ = get_reward( + self.reward_model, postprocessed_query_response, processing_class.pad_token_id, context_length + ) + 
table["score"].extend(self.accelerator.gather_for_metrics(score).float().cpu().numpy()) + + if sampling: + break + df = pd.DataFrame(table) + + if self.accelerator.is_main_process: + if is_rich_available(): + print_rich_table(df.iloc[0 : 0 + 5]) + if "wandb" in args.report_to: + import wandb + + if wandb.run is not None: + wandb.log({"completions": wandb.Table(dataframe=df)}) + + if "comet_ml" in args.report_to: + log_table_to_comet_experiment( + name="completions.csv", + table=df, + ) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + # docstyle-ignore + citation = textwrap.dedent("""\ + @article{mziegler2019fine-tuning, + title = {{Fine-Tuning Language Models from Human Preferences}}, + author = {Daniel M. Ziegler and Nisan Stiennon and Jeffrey Wu and Tom B. Brown and Alec Radford and Dario Amodei and Paul F. Christiano and Geoffrey Irving}, + year = 2019, + eprint = {arXiv:1909.08593} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="PPO", + trainer_citation=citation, + paper_title="Fine-Tuning Language Models from Human Preferences", + paper_id="1909.08593", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothPPOTrainer(_UnslothPPOTrainer): + """ + Trainer for Proximal Policy Optimization (PPO). + +For details on PPO, see the paper: [Proximal Policy Optimization +Algorithms](https://huggingface.co/papers/1707.06347). + +Args: + args ([`PPOConfig`]): + Training arguments. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`]): + Class to process the data. + model (`torch.nn.Module`): + Model to be trained. This is the policy model. + ref_model (`torch.nn.Module`, *optional*): + Reference model used to compute the KL divergence. If `None`, a copy of the policy model is created. 
+ reward_model (`torch.nn.Module`): + Reward model used to compute the rewards. + train_dataset ([`~datasets.Dataset`]): + Dataset for training. + value_model (`torch.nn.Module`): + Value model used to predict the value of a state. + data_collator ([`~transformers.DataCollatorWithPadding`], *optional*): + Data collator to batch and pad samples from the dataset. If `None`, a default data collator is created + using the `processing_class`. + eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*): + Dataset for evaluation. + optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`): + Tuple containing the optimizer and the learning rate scheduler to use for training. If `None`, the + optimizer and the learning rate scheduler are created using the + [`~transformers.Trainer.create_optimizer_and_scheduler`] method. + callbacks (`list` of [`~transformers.TrainerCallback`], *optional*): + Callbacks to use during training. + peft_config ([`~peft.config.PeftConfig`], *optional*): + PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the policy `model` + will be wrapped with the specified PEFT adapter. + + """ + def __init__( + self, + args, + processing_class, + model, + ref_model, + reward_model, + train_dataset, + value_model, + data_collator = None, + eval_dataset = None, + callbacks = None, + peft_config = None, + **kwargs + ): + if args is None: args = UnslothPPOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if 
isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('ppo_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? [TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + args = args, + processing_class = processing_class, + model = model, + ref_model = ref_model, + reward_model = reward_model, + train_dataset = train_dataset, + value_model = value_model, + data_collator = data_collator, + eval_dataset = eval_dataset, + callbacks = callbacks, + peft_config = peft_config,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass diff --git a/unsloth_compiled_cache/UnslothPRMTrainer.py b/unsloth_compiled_cache/UnslothPRMTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..d9c99f6d219a63c8b76283b0f89125292d91a906 --- /dev/null +++ b/unsloth_compiled_cache/UnslothPRMTrainer.py @@ -0,0 +1,1038 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . + +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.prm_trainer import (BaseImageProcessor, Callable, DataCollator, DataCollatorForTokenClassification, Dataset, EvalPrediction, FeatureExtractionMixin, Optional, PRMConfig, PRMTrainer, PartialState, Path, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, Trainer, TrainerCallback, Union, chain, compute_accuracy, disable_dropout_in_model, features, generate_model_card, is_wandb_available, nn, os, prepare_peft_model, textwrap, torch, Optional, PreTrainedModel, Trainer, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) + logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values + all_per_token_logps.append(per_token_logps) + pass + all_per_token_logps = torch.concat(all_per_token_logps) + all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) + return all_per_token_logps + +def calculate_pad_tokens_in_prompt( + input_ids: torch.Tensor, + logits_to_keep: int, + pad_token_id: int +) -> torch.Tensor: + 
""" + Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens + """ + if logits_to_keep >= input_ids.shape[1]: + raise ValueError("logits_to_keep must be smaller than the sequence length.") + + prompt_section = input_ids[:, :-logits_to_keep] + + padding_mask = (prompt_section == pad_token_id) + + pad_token_counts = padding_mask.sum(dim=1) + + return pad_token_counts + +def create_completion_attention_mask( + completion_input_ids: torch.Tensor, + left_pad_tokens_per_prompt: torch.Tensor, + max_left_pad: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad] + + Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens + and pad are pad tokens, this function would make a completion mask that would 0 out the pad + and p tokens. so in this example [0,0,0,1,1,1,0,0,0] + """ + batch_size, completion_len = completion_input_ids.shape + device = completion_input_ids.device + + num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt + + indices = torch.arange(completion_len, device=device).unsqueeze(0) + shift_mask = indices >= num_tokens_to_mask.unsqueeze(1) + + non_padding_mask = (completion_input_ids != pad_token_id) + + final_mask = shift_mask & non_padding_mask + + return final_mask + +def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor: + """ + Moves all padding tokens in each sequence of a batch to the right. + """ + mask = (tensor != pad_id) + # Must do stable=True since binary mark is unordered + sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True) + packed_tensor = torch.gather(tensor, 1, sorted_indices) + return packed_tensor + +def align_logprobs_with_mask( + logprob_tensor: torch.Tensor, + attention_mask: torch.Tensor, + pad_value: float = 0.0 +) -> torch.Tensor: + """ + Aligns a log probability tensor with a given attention mask. + """ + + device = logprob_tensor.device + batch_size, logprob_seq_len = logprob_tensor.shape + mask_seq_len = attention_mask.shape[1] + + padded_logprobs = torch.full( + attention_mask.shape, + fill_value=pad_value, + dtype=logprob_tensor.dtype, + device=device + ) + + left_pad_counts = torch.argmax(attention_mask, dim=1) + + cols = torch.arange(logprob_seq_len, device=device) + dest_indices = left_pad_counts.unsqueeze(1) + cols + + # Create destination row indices + # Shape: [batch_size, logprob_seq_len] + row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices) + + # --- 4. Filter out-of-bounds indices and perform assignment --- + # Create a mask to identify only the indices that are within the bounds + # of the target tensor's sequence length. + valid_mask = dest_indices < mask_seq_len + + # Use this mask to select only the valid row indices, column indices, + # and the corresponding values from the logprob tensor. + # This flattens the selected elements into 1D tensors. + valid_rows = row_indices[valid_mask] + valid_cols = dest_indices[valid_mask] + valid_vals = logprob_tensor[valid_mask] + + # Place the valid values into their correct positions in the padded tensor + # using a single, efficient advanced indexing operation. + padded_logprobs[valid_rows, valid_cols] = valid_vals + + return padded_logprobs +@dataclass +class UnslothPRMConfig(PRMConfig): + """ + +Configuration class for the [`PRMTrainer`]. + +This class includes only the parameters that are specific to PRM training. 
For a full list of training arguments, +please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may +differ from those in [`~transformers.TrainingArguments`]. + +Using [`~transformers.HfArgumentParser`] we can turn this class into +[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the +command line. + +Parameters: + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the sequences (prompt + completion) used for truncation. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt used for truncation. + max_completion_length (`int` or `None`, *optional*, defaults to `None`): + Maximum length of the completion used for truncation. The completion is the concatenation of the steps. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + step_separator (`str`, *optional*, defaults to `"\n"`): + Separator used to separate each step of the reasoning process. + train_on_last_step_only (`bool`, *optional*, defaults to `False`): + Whether to train only on the last step. + dataset_num_proc (`int`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + max_seq_length : Optional[int] = field( + default = None, + metadata = {'help': 'Maximum sequence length to truncate to.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + 
greater_is_better = None,
+        ignore_data_skip = False,
+        fsdp = None,
+        fsdp_min_num_params = 0,
+        fsdp_config = None,
+        fsdp_transformer_layer_cls_to_wrap = None,
+        accelerator_config = None,
+        parallelism_config = None,
+        deepspeed = None,
+        label_smoothing_factor = 0.0,
+        optim = 'adamw_8bit',
+        optim_args = None,
+        adafactor = False,
+        group_by_length = False,
+        length_column_name = 'length',
+        report_to = None,
+        project = 'huggingface',
+        trackio_space_id = 'trackio',
+        ddp_find_unused_parameters = None,
+        ddp_bucket_cap_mb = None,
+        ddp_broadcast_buffers = None,
+        dataloader_pin_memory = True,
+        dataloader_persistent_workers = False,
+        skip_memory_metrics = True,
+        use_legacy_prediction_loop = False,
+        push_to_hub = False,
+        resume_from_checkpoint = None,
+        hub_model_id = None,
+        hub_strategy = 'every_save',
+        hub_token = None,
+        hub_private_repo = None,
+        hub_always_push = False,
+        hub_revision = None,
+        gradient_checkpointing = True,
+        gradient_checkpointing_kwargs = None,
+        include_inputs_for_metrics = False,
+        eval_do_concat_batches = True,
+        fp16_backend = 'auto',
+        push_to_hub_model_id = None,
+        push_to_hub_organization = None,
+        push_to_hub_token = None,
+        mp_parameters = '',
+        auto_find_batch_size = False,
+        full_determinism = False,
+        torchdynamo = None,
+        ray_scope = 'last',
+        ddp_timeout = 1800,
+        torch_compile = False,
+        torch_compile_backend = None,
+        torch_compile_mode = None,
+        include_tokens_per_second = False,
+        include_num_input_tokens_seen = False,
+        neftune_noise_alpha = None,
+        optim_target_modules = None,
+        batch_eval_metrics = False,
+        eval_on_start = False,
+        use_liger_kernel = False,
+        liger_kernel_config = None,
+        eval_use_gather_object = False,
+        average_tokens_across_devices = True,
+        max_length = 1024,
+        max_prompt_length = 512,
+        max_completion_length = None,
+        disable_dropout = True,
+        step_separator = '\n',
+        train_on_last_step_only = False,
+        dataset_num_proc = None,
+        vllm_sampling_params = None,
+        unsloth_num_chunks = -1,
+        max_seq_length = None,
+        **kwargs,
+    ):
+        if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!')
+        if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too large (> 1)!
Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = min(max(cpu_count()+4, 2), 64) + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + 
ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + max_length = max_length, + max_prompt_length = max_prompt_length, + max_completion_length = max_completion_length, + disable_dropout = disable_dropout, + step_separator = step_separator, + train_on_last_step_only = train_on_last_step_only, + dataset_num_proc = dataset_num_proc,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks + self.max_seq_length = max_seq_length +pass + +class _UnslothPRMTrainer(Trainer): + """ + Initialize PRMTrainer. + + Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForTokenClassification`. + args (`PRMConfig`): + The arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. If None is specified, the default data collator + (`DataCollatorForTokenClassification`) will be used which will pad the sequences to the maximum length of + the sequences in the batch, given a dataset of paired sequences. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. 
+ model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + compute_metrics (`Callable[[transformers.EvalPrediction], dict]`, *optional* defaults to `compute_accuracy`): + The metrics to use for evaluation. If no metrics are specified, the default metric (`compute_accuracy`) + will be used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + peft_config (`dict`, defaults to `None`): + The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in + a PEFT model. + """ + + _tag_names = ["trl", "prm"] + + def __init__( + self, + model: Optional[Union[PreTrainedModel, nn.Module]] = None, + args: Optional[PRMConfig] = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = ( + None, + None, + ), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional[dict] = None, + ): + if False: + model = prepare_peft_model(model, peft_config, args) + + # Disable dropout in the model + if args.disable_dropout: + disable_dropout_in_model(model) + + if compute_metrics is None: + compute_metrics = compute_accuracy + + if data_collator is None: + if processing_class is None: + raise ValueError( + "A processing_class must be specified when using the default DataCollatorForTokenClassification" + ) + data_collator = DataCollatorForTokenClassification(processing_class, max_length=args.max_length) + + if "input_ids" not in train_dataset.column_names: + with PartialState().main_process_first(): + fn_kwargs = { + "tokenizer": processing_class, + "step_separator": args.step_separator, + "max_length": args.max_length, + "max_prompt_length": args.max_prompt_length, + "max_completion_length": args.max_completion_length, + "train_on_last_step_only": args.train_on_last_step_only, + } + train_fn_kwargs = {**fn_kwargs, "is_eval": False} + train_dataset = train_dataset.map( + self.tokenize_row, + fn_kwargs=train_fn_kwargs, + num_proc=args.dataset_num_proc, + remove_columns=train_dataset.features, + desc="Tokenizing train dataset", + features=features.Features( # needed to avoid map to cast labels to bool + { + "labels": features.Sequence(features.Value("int64")), + "input_ids": features.Sequence(features.Value("int64")), + } + ), + ) + + eval_fn_kwargs = {**fn_kwargs, "is_eval": True} + if eval_dataset is not None: + eval_dataset = eval_dataset.map( + self.tokenize_row, + fn_kwargs=eval_fn_kwargs, + num_proc=args.dataset_num_proc, + remove_columns=eval_dataset.features, + desc="Tokenizing eval dataset", + features=features.Features( # needed to avoid map to cast 
labels to bool + { + "labels": features.Sequence(features.Value("int64")), + "input_ids": features.Sequence(features.Value("int64")), + } + ), + ) + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + @staticmethod + def tokenize_row( + features, + tokenizer, + step_separator, + max_length, + max_prompt_length, + max_completion_length, + train_on_last_step_only, + is_eval, + ): + r""" + Tokenize a row of the dataset. + + Args: + features (`dict[str, str]`): + Row of the dataset, should contain the keys `"prompt"`, `"completions"`, and `"labels"`. + tokenizer (`PreTrainedTokenizerBase`): + Tokenizer used to process the data. + step_separator (`str`): + Separator between steps in the completion. + max_length (`int` or `None`): + Maximum length of the sequences (prompt + completion). If `None`, the sequences are not truncated. + max_prompt_length (`int` or `None`): + Maximum length of the prompt. If `None`, the prompt is not truncated. + max_completion_length (`int` or `None`): + Maximum length of the completion sequences. If `None`, the completion sequences are not truncated. + train_on_last_step_only (`bool`): + Whether to train only on the last step. If `True`, the labels are `-100` for all tokens except the last + token of the completion. + is_eval (`bool`): + Whether the function is used to tokenize samples from a training or an evaluation dataset. Used only if + `train_on_last_step_only` is set to `True`. + + Returns: + `dict[str, list[int]]`: + Tokenized sequences with the keys `"input_ids"`, and `"labels". + + Example: + ```python + >>> from transformers import AutoTokenizer + + >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B") + >>> features = { + ... "prompt": "Which number is larger, 9.8 or 9.11?", + ... "completions": ["11 is greater than 8.", "Hence, 9.11 > 9.8."], + ... "labels": [True, False], + ... } + >>> PRMTrainer.tokenize_row( + ... features, tokenizer, "\n", max_completion_length=None, train_on_last_step_only=False, is_eval=False + ... 
) + {'input_ids': [23085, 1372, 374, 8131, 11, 220, 24, 13, 23, 476, 220, 24, 13, 16, 16, 30, 16, 16, 374, 7046, 1091, 220, 23, 13, 198, 39, 763, 11, 220, 24, 13, 16, 16, 861, 220, 24, 13, 23, 13, 198], + 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 0]} + ``` + """ + # Tokenize the prompt and completions + prompt_ids = tokenizer(features["prompt"], add_special_tokens=False)["input_ids"] + completions_ids = [ + tokenizer(completion, add_special_tokens=False)["input_ids"] for completion in features["completions"] + ] + if train_on_last_step_only and not is_eval: + labels = [-100] * (len(features["labels"]) - 1) + [int(features["labels"][-1])] + else: + labels = [int(label) for label in features["labels"]] + + # Get the ID of the separator token and add it to the completions + separator_ids = tokenizer.encode(step_separator, add_special_tokens=False) + completions_ids = [completion + separator_ids for completion in completions_ids] + + # Create the label + labels = [[-100] * (len(completion) - 1) + [label] for completion, label in zip(completions_ids, labels)] + + # Join the completions and labels steps + completion_ids = list(chain(*completions_ids)) + labels = list(chain(*labels)) + + if tokenizer.bos_token_id is not None: + prompt_ids = [tokenizer.bos_token_id] + prompt_ids + + # Truncate prompt and completion sequences + if max_prompt_length is not None: + prompt_ids = prompt_ids[-max_prompt_length:] + if max_completion_length is not None: + completion_ids = completion_ids[:max_completion_length] + labels = labels[:max_completion_length] + + input_ids = prompt_ids + completion_ids + labels = [-100] * len(prompt_ids) + labels + + if max_length is not None: + input_ids = input_ids[:max_length] + labels = labels[:max_length] + + return {"input_ids": input_ids, "labels": labels} + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + # docstyle-ignore + citation = textwrap.dedent("""\ + @article{uesato2022solving, + title = {{Solving Math Word Problems With Process- and Outcome-Based Feedback}}, + author = {Uesato, Jonathan and Kushman, Nate and Kumar, Ramana and Song, Francis and Siegel, Noah and Wang, Lisa and Creswell, Antonia and Irving, Geoffrey and Higgins, Irina}, + year = 2022, + journal = {arXiv preprint arXiv:2211.14275} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + trainer_name="PRM", + trainer_citation=citation, + paper_title="Solving math word problems with process-and outcome-based feedback", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothPRMTrainer(_UnslothPRMTrainer): + """ + +Initialize PRMTrainer. + +Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForTokenClassification`. + args (`PRMConfig`): + The arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. If None is specified, the default data collator + (`DataCollatorForTokenClassification`) will be used which will pad the sequences to the maximum length of + the sequences in the batch, given a dataset of paired sequences. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + compute_metrics (`Callable[[transformers.EvalPrediction], dict]`, *optional* defaults to `compute_accuracy`): + The metrics to use for evaluation. If no metrics are specified, the default metric (`compute_accuracy`) + will be used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + peft_config (`dict`, defaults to `None`): + The PEFT configuration to use for training. 
If you pass a PEFT configuration, the model will be wrapped in + a PEFT model. + + """ + def __init__( + self, + model = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + model_init = None, + compute_metrics = None, + callbacks = None, + preprocess_logits_for_metrics = None, + peft_config = None, + **kwargs + ): + if args is None: args = UnslothPRMConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif 
os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('prm_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? 
[TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + model = model, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + model_init = model_init, + compute_metrics = compute_metrics, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass diff --git a/unsloth_compiled_cache/UnslothRLOOTrainer.py b/unsloth_compiled_cache/UnslothRLOOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..10c9d2ebe2a8eb1d32666087e6ee9da4a102184f --- /dev/null +++ b/unsloth_compiled_cache/UnslothRLOOTrainer.py @@ -0,0 +1,2465 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . 
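+
+# Illustrative usage sketch (an assumption for documentation, not part of the
+# generated code): the classes below are drop-in replacements for their TRL
+# counterparts, e.g.
+#
+#     from unsloth_compiled_cache.UnslothRLOOTrainer import (
+#         UnslothRLOOConfig, UnslothRLOOTrainer,
+#     )
+#     args = UnslothRLOOConfig(output_dir = "outputs", max_steps = 10)
+#     trainer = UnslothRLOOTrainer(model = model, reward_funcs = reward_fn,
+#                                  args = args, train_dataset = dataset)
+#     trainer.train()
+#
+# where `model`, `reward_fn` and `dataset` are placeholders for the user's own
+# policy model, reward function, and prompt dataset.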
+ +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.rloo_trainer import (Any, AutoConfig, AutoModelForSequenceClassification, AutoProcessor, AutoTokenizer, DataLoader, Dataset, FSDP, GenerationConfig, IterableDataset, Optional, Path, PeftConfig, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RLOOConfig, RLOOTrainer, RepeatSampler, RewardFunc, Sampler, SyncRefModelCallback, Trainer, TrainerCallback, Union, VLLMClient, apply_chat_template, broadcast_object_list, copy, datasets, defaultdict, deque, disable_dropout_in_model, entropy_from_logits, gather, gather_object, generate_model_card, get_comet_experiment_url, identity, inspect, is_conversational, is_datasets_available, is_flash_attn_2_available, is_peft_model, is_rich_available, is_vllm_available, is_wandb_available, logger, logging, maybe_apply_chat_template, nanmax, nanmin, nanstd, nn, nullcontext, os, pad, partial, prepare_deepspeed, prepare_fsdp, prepare_peft_model, print_prompt_completions_sample, profiling_context, profiling_decorator, re, seed_worker, selective_log_softmax, set_seed, shuffle_sequence_dict, split_pixel_values_by_grid, split_tensor_dict, textwrap, torch, transformers, truncate_with_protected_tokens, unsplit_pixel_values_by_grid, unwrap_model_for_generation, warnings, Any, FSDP, Union, apply_chat_template, broadcast_object_list, copy, gather, gather_object, is_conversational, is_flash_attn_2_available, logging, maybe_apply_chat_template, nanstd, nullcontext, os, pad, profiling_context, re, torch, transformers, truncate_with_protected_tokens, unwrap_model_for_generation, FSDP, gather, is_peft_model, nn, nullcontext, os, profiling_decorator, re, Any, Union, profiling_decorator, re, shuffle_sequence_dict, split_pixel_values_by_grid, split_tensor_dict, torch, unsplit_pixel_values_by_grid, Optional, PreTrainedModel, Trainer, logger, os, re, torch, FSDP, nn, os, re, FSDP, nn, re, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as 
#   selective_log_softmax(chunk_logits, chunk_index)
+    for chunk_logits, chunk_index in zip(chunked_logits, chunked_index):
+        chunk_logits = chunk_logits.to(torch.float32)
+        selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1)
+        logsumexp_values = torch.logsumexp(chunk_logits, dim = -1)
+        per_token_logps = selected_logits - logsumexp_values
+        all_per_token_logps.append(per_token_logps)
+    pass
+    all_per_token_logps = torch.concat(all_per_token_logps)
+    all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1]))
+    return all_per_token_logps
+
+def calculate_pad_tokens_in_prompt(
+    input_ids: torch.Tensor,
+    logits_to_keep: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a prompt tensor, return the number of left-pad tokens in each sequence,
+    e.g. [pad, pad, pad, cat] -> 3 tokens.
+    """
+    if logits_to_keep >= input_ids.shape[1]:
+        raise ValueError("logits_to_keep must be smaller than the sequence length.")
+
+    prompt_section = input_ids[:, :-logits_to_keep]
+
+    padding_mask = (prompt_section == pad_token_id)
+
+    pad_token_counts = padding_mask.sum(dim=1)
+
+    return pad_token_counts
+
+def create_completion_attention_mask(
+    completion_input_ids: torch.Tensor,
+    left_pad_tokens_per_prompt: torch.Tensor,
+    max_left_pad: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a sequence [p, p, p, c, c, c, pad, pad, pad], where p are extra prompt
+    tokens left over from slicing the tensor, c are completion tokens, and pad are
+    pad tokens, build a completion mask that zeroes out the p and pad tokens:
+    in this example, [0, 0, 0, 1, 1, 1, 0, 0, 0].
+    """
+    batch_size, completion_len = completion_input_ids.shape
+    device = completion_input_ids.device
+
+    num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
+
+    indices = torch.arange(completion_len, device=device).unsqueeze(0)
+    shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
+
+    non_padding_mask = (completion_input_ids != pad_token_id)
+
+    final_mask = shift_mask & non_padding_mask
+
+    return final_mask
+
+def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
+    """
+    Moves all padding tokens in each sequence of a batch to the right.
+    """
+    mask = (tensor != pad_id)
+    # Must use stable=True: the mask is binary, so only a stable sort preserves
+    # the original relative order of tokens within each group
+    sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
+    packed_tensor = torch.gather(tensor, 1, sorted_indices)
+    return packed_tensor
+
+def align_logprobs_with_mask(
+    logprob_tensor: torch.Tensor,
+    attention_mask: torch.Tensor,
+    pad_value: float = 0.0
+) -> torch.Tensor:
+    """
+    Aligns a log probability tensor with a given attention mask.
+    """
+
+    device = logprob_tensor.device
+    batch_size, logprob_seq_len = logprob_tensor.shape
+    mask_seq_len = attention_mask.shape[1]
+
+    padded_logprobs = torch.full(
+        attention_mask.shape,
+        fill_value=pad_value,
+        dtype=logprob_tensor.dtype,
+        device=device
+    )
+
+    left_pad_counts = torch.argmax(attention_mask, dim=1)
+
+    cols = torch.arange(logprob_seq_len, device=device)
+    dest_indices = left_pad_counts.unsqueeze(1) + cols
+
+    # Create destination row indices
+    # Shape: [batch_size, logprob_seq_len]
+    row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
+
+    # Filter out-of-bounds indices and perform the assignment:
+    # keep only the indices that fall within the bounds of the target
+    # tensor's sequence length.
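+    # Worked example (hypothetical shapes): with logprob_seq_len = 3,
+    # mask_seq_len = 4 and left_pad_counts = [2], dest_indices = [[2, 3, 4]];
+    # index 4 is out of bounds, so only positions 2 and 3 receive values.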
+ valid_mask = dest_indices < mask_seq_len + + # Use this mask to select only the valid row indices, column indices, + # and the corresponding values from the logprob tensor. + # This flattens the selected elements into 1D tensors. + valid_rows = row_indices[valid_mask] + valid_cols = dest_indices[valid_mask] + valid_vals = logprob_tensor[valid_mask] + + # Place the valid values into their correct positions in the padded tensor + # using a single, efficient advanced indexing operation. + padded_logprobs[valid_rows, valid_cols] = valid_vals + + return padded_logprobs +def vLLMSamplingParams(**kwargs): + from vllm import SamplingParams + sampling_params = SamplingParams(**kwargs) + sampling_params._set_kwargs = kwargs + return sampling_params +@dataclass +class UnslothRLOOConfig(RLOOConfig): + """ + +Configuration class for the [`RLOOTrainer`]. + +This class includes only the parameters that are specific to RLOO training. For a full list of training arguments, +please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may +differ from those in [`~transformers.TrainingArguments`]. + +Using [`~transformers.HfArgumentParser`] we can turn this class into +[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the +command line. + +Parameters: + > Parameters that control the model and reference model + + model_init_kwargs (`str`, `dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model` + argument of the [`GRPOTrainer`] is provided as a string. + disable_dropout (`bool`, *optional*, defaults to `False`): + Whether to disable dropout in the model. This is useful for training with a reference model, as it prevents + the model from generating different logprobs for the same input. + + > Parameters that control the data preprocessing + + remove_unused_columns (`bool`, *optional*, defaults to `False`): + Whether to only keep the column `"prompt"` in the dataset. If you use a custom reward function that + requires any column other than `"prompts"` and `"completions"`, you should keep this to `False`. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left. + num_generations (`int` or `None`, *optional*, defaults to `2`): + Number of generations per prompt to sample. The effective batch size (num_processes * per_device_batch_size + * gradient_accumulation_steps) must be evenly divisible by this value. + max_completion_length (`int` or `None`, *optional*, defaults to `256`): + Maximum length of the generated completion. + ds3_gather_for_generation (`bool`, *optional*, defaults to `True`): + This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, + improving generation speed. However, disabling this option allows training models that exceed the VRAM + capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible + with vLLM generation. + shuffle_dataset (`bool`, *optional*, defaults to `True`): + Whether to shuffle the training dataset. + + > Parameters that control generation + + generation_batch_size: (`int` or `None`, *optional*, defaults to `None`): + Batch size to use for generation. 
If `None`, it defaults to the effective training batch size: + `per_device_train_batch_size * num_processes * steps_per_generation`. In other words, there is one + generation batch processed per optimization step. Mutually exclusive with `steps_per_generation`. + steps_per_generation: (`int` or `None`, *optional*, defaults to `None`): + Number of steps per generation. If `None`, it defaults to `gradient_accumulation_steps`. Mutually exclusive + with `generation_batch_size`. + temperature (`float`, defaults to `1.0`): + Temperature for sampling. The higher the temperature, the more random the completions. + top_p (`float`, *optional*, defaults to `1.0`): + Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to + `1.0` to consider all tokens. + top_k (`int` or `None`, *optional*, defaults to `None`): + Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is + disabled and all tokens are considered. + min_p (`float` or `None`, *optional*, defaults to `None`): + Minimum token probability, which will be scaled by the probability of the most likely token. It must be a + value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range. + repetition_penalty (`float`, *optional*, defaults to `1.0`): + Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far. + Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat + tokens. + use_transformers_paged (`bool`, *optional*, defaults to `False`): + Whether to use the `transformers` paged implementation for generation. If set to `True`, the `transformers` + paged implementation will be used for generation instead of the default padded implementation. This + parameter is only effective when `use_vllm` is set to `False`. + cache_implementation (`str` or `None`, *optional*, defaults to `None`): + Implementation of the cache method for faster generation when `use_vllm` is set to `False`. + generation_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Additional keyword arguments to pass to `GenerationConfig` (if using transformers) or `SamplingParams` (if + using vLLM) when sampling completions. This can be used to further customize the generation behavior, such + as setting `suppress_tokens`, `num_beams`, etc. If it contains keys that conflict with the other generation + parameters (like `min_p`, `top_p`, etc.), they will override them. + + > Parameters that control generation acceleration powered by vLLM + + use_vllm (`bool`, *optional*, defaults to `False`): + Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for generation + instead of the default model.generate(). Requires `vllm` to be installed. + vllm_mode (`str`, *optional*, defaults to `"server"`): + Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or + `"colocate"`. + + - `"server"`: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM + server is running (start with `trl vllm-serve`). + - `"colocate"`: vLLM will run in the same process and share the training GPUs. This avoids the need for a + separate server but may cause resource contention with training. + vllm_model_impl (`str`, *optional*, defaults to `"vllm"`): + Model implementation to use for vLLM. Must be one of `"transformers"` or `"vllm"`. 
+
+        - `"transformers"`: Use the `transformers` backend for model implementation.
+        - `"vllm"`: Use the `vllm` library for model implementation.
+    vllm_guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`):
+        Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled.
+
+    > Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`)
+
+    vllm_server_base_url (`str` or `None`, *optional*, defaults to `None`):
+        Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `vllm_server_host` and
+        `vllm_server_port` are ignored.
+    vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`):
+        Host of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
+    vllm_server_port (`int`, *optional*, defaults to `8000`):
+        Port of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
+    vllm_server_timeout (`float`, *optional*, defaults to `240.0`):
+        Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the
+        timeout, a `ConnectionError` is raised.
+
+    > Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`)
+
+    vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.3`):
+        Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set to
+        `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
+        launching the vLLM server via the `--vllm_gpu_memory_utilization` flag.
+    vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`):
+        Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set to
+        `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
+        launching the vLLM server via the `--vllm_tensor_parallel_size` flag.
+
+    > Parameters that control the training
+
+    beta (`float`, *optional*, defaults to `0.05`):
+        KL coefficient. If `0.0`, the reference model is not loaded, reducing memory usage and improving training
+        speed.
+    num_iterations (`int`, *optional*, defaults to `1`):
+        Number of iterations per batch (denoted as μ in the algorithm).
+    epsilon (`float`, *optional*, defaults to `0.2`):
+        Epsilon value for clipping.
+    epsilon_high (`float` or `None`, *optional*, defaults to `None`):
+        Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound
+        specified in argument `epsilon`. Paper [DAPO](https://huggingface.co/papers/2503.14476) recommends `0.28`.
+    reward_weights (`list[float]` or `None`, *optional*, defaults to `None`):
+        Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are
+        weighted equally with weight `1.0`.
+    normalize_advantages (`bool`, *optional*, defaults to `False`):
+        Whether to normalize advantages. Normalization is done per generation batch to have mean `0.0` and standard
+        deviation of `1.0`.
+    reward_clip_range (`tuple[float, float]` or `None`, *optional*, defaults to `None`):
+        Clip range for rewards as (min, max). If `None`, no clipping is applied.
+    mask_truncated_completions (`bool`, *optional*, defaults to `False`):
+        When enabled, truncated completions are excluded from the loss calculation, preventing them from being
+        incorrectly penalized and introducing noise during training. According to the
+        [DAPO](https://huggingface.co/papers/2503.14476) paper, this is a good practice for training stability.
+ sync_ref_model (`bool`, *optional*, defaults to `False`): + Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using + the `ref_model_mixup_alpha` parameter. This synchronization originates from the + [TR-DPO](https://huggingface.co/papers/2404.09656) paper. + ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`): + α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix + between the current policy and the previous reference policy during updates. The reference policy is + updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you + must set `sync_ref_model=True`. + ref_model_sync_steps (`int`, *optional*, defaults to `512`): + τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how + frequently the current policy is synchronized with the reference policy. To use this parameter, you must + set `sync_ref_model=True`. + + > Parameters that control the logging + + log_completions (`bool`, *optional*, defaults to `False`): + Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is installed, + it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`. + num_completions_to_print (`int` or `None`, *optional*, defaults to `None`): + Number of completions to print with `rich`. If `None`, all completions are logged. + wandb_log_unique_prompts (`bool`, *optional*, defaults to `False`): + Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, all prompts + are logged. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = False, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = None, + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + parallelism_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + project = 'huggingface', + trackio_space_id = 'trackio', + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = True, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + 
eval_use_gather_object = False,
+        average_tokens_across_devices = True,
+        model_init_kwargs = None,
+        disable_dropout = False,
+        max_prompt_length = 512,
+        num_generations = 8,
+        max_completion_length = 256,
+        ds3_gather_for_generation = True,
+        shuffle_dataset = True,
+        generation_batch_size = None,
+        steps_per_generation = None,
+        temperature = 1.0,
+        top_p = 1.0,
+        top_k = None,
+        min_p = None,
+        generation_kwargs = None,
+        repetition_penalty = 1.0,
+        use_transformers_paged = False,
+        cache_implementation = None,
+        use_vllm = False,
+        vllm_mode = 'colocate',
+        vllm_model_impl = 'vllm',
+        vllm_guided_decoding_regex = None,
+        vllm_server_base_url = None,
+        vllm_server_host = '0.0.0.0',
+        vllm_server_port = 8000,
+        vllm_server_timeout = 240.0,
+        vllm_gpu_memory_utilization = 0.3,
+        vllm_tensor_parallel_size = 1,
+        beta = 0.05,
+        num_iterations = 1,
+        epsilon = 0.2,
+        epsilon_high = None,
+        reward_weights = None,
+        normalize_advantages = False,
+        reward_clip_range = None,
+        mask_truncated_completions = False,
+        sync_ref_model = False,
+        ref_model_mixup_alpha = 0.6,
+        ref_model_sync_steps = 512,
+        log_completions = False,
+        num_completions_to_print = None,
+        wandb_log_unique_prompts = False,
+        rloo_k = None,
+        cliprange = None,
+        kl_coef = None,
+        exp_name = None,
+        normalize_reward = None,
+        num_ppo_epochs = None,
+        num_mini_batches = None,
+        total_episodes = None,
+        response_length = None,
+        token_level_kl = None,
+        dataset_num_proc = None,
+        local_rollout_forward_batch_size = None,
+        num_sample_generations = None,
+        stop_token = None,
+        stop_token_id = None,
+        missing_eos_penalty = None,
+        vllm_sampling_params = None,
+        unsloth_num_chunks = -1,
+
+        **kwargs,
+    ):
+        if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small (less than 1e-7)! Consider increasing it, otherwise gradient updates will be close to 0!')
+        if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too large (> 1)! Consider decreasing it to 1e-1, otherwise gradient updates will explode!')
+        if output_dir is None and save_strategy == 'steps' and save_steps == 500:
+            output_dir = 'unsloth_training_checkpoints'
+            save_strategy = 'no'
+        if dataset_num_proc is None:
+            from multiprocessing import cpu_count
+            dataset_num_proc = min(max(cpu_count()+4, 2), 64)
+        if (per_device_train_batch_size // num_generations) * num_generations != per_device_train_batch_size:
+            print('Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.\nWe will change the batch size of ' + str(per_device_train_batch_size) + ' to the `num_generations` of ' + str(num_generations))
+            per_device_train_batch_size = num_generations
+
+        if temperature <= 0:
+            raise ValueError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.')
+        elif temperature >= 10:
+            raise ValueError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.')
+
+
+        super().__init__(
+            output_dir = output_dir,
+            overwrite_output_dir = overwrite_output_dir,
+            do_train = do_train,
+            do_eval = do_eval,
+            do_predict = do_predict,
+            eval_strategy = eval_strategy,
+            prediction_loss_only = prediction_loss_only,
+            per_device_train_batch_size = per_device_train_batch_size,
+            per_device_eval_batch_size = per_device_eval_batch_size,
+            per_gpu_train_batch_size = per_gpu_train_batch_size,
+            per_gpu_eval_batch_size = per_gpu_eval_batch_size,
+            gradient_accumulation_steps = gradient_accumulation_steps,
+            eval_accumulation_steps = eval_accumulation_steps,
+            eval_delay = eval_delay,
+            torch_empty_cache_steps = torch_empty_cache_steps,
+            learning_rate = learning_rate,
+            weight_decay = weight_decay,
+            adam_beta1 = adam_beta1,
+            adam_beta2 = adam_beta2,
+            adam_epsilon = adam_epsilon,
+            max_grad_norm = max_grad_norm,
+            num_train_epochs = num_train_epochs,
+            max_steps = max_steps,
+            lr_scheduler_type = lr_scheduler_type,
+            warmup_ratio = warmup_ratio,
+            warmup_steps = warmup_steps,
+            log_level = log_level,
+            log_level_replica = log_level_replica,
+            log_on_each_node = log_on_each_node,
+            logging_dir = logging_dir,
+            logging_strategy = logging_strategy,
+            logging_first_step = logging_first_step,
+            logging_steps = logging_steps,
+            logging_nan_inf_filter = logging_nan_inf_filter,
+            save_strategy = save_strategy,
+            save_steps = save_steps,
+            save_total_limit = save_total_limit,
+            save_safetensors = save_safetensors,
+            save_on_each_node = save_on_each_node,
+            save_only_model = save_only_model,
+            restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint,
+            no_cuda = no_cuda,
+            use_cpu = use_cpu,
+            use_mps_device = use_mps_device,
+            seed = seed,
+            data_seed = data_seed,
+            jit_mode_eval = jit_mode_eval,
+            bf16 = bf16,
+            fp16 = fp16,
+            fp16_opt_level = fp16_opt_level,
+            half_precision_backend = half_precision_backend,
+            bf16_full_eval = bf16_full_eval,
+            fp16_full_eval = fp16_full_eval,
+            tf32 = tf32,
+            local_rank = local_rank,
+            ddp_backend = ddp_backend,
+            tpu_num_cores = tpu_num_cores,
+            tpu_metrics_debug = tpu_metrics_debug,
+            debug = debug,
+            dataloader_drop_last = dataloader_drop_last,
+            eval_steps = eval_steps,
+            dataloader_num_workers = dataloader_num_workers,
+            dataloader_prefetch_factor = dataloader_prefetch_factor,
+            past_index = past_index,
+            run_name = run_name,
+            disable_tqdm = disable_tqdm,
+            remove_unused_columns = remove_unused_columns,
+            label_names = label_names,
+            load_best_model_at_end = load_best_model_at_end,
+            metric_for_best_model =
metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + model_init_kwargs = model_init_kwargs, + disable_dropout = disable_dropout, + max_prompt_length = max_prompt_length, + num_generations = num_generations, + max_completion_length = max_completion_length, + ds3_gather_for_generation = ds3_gather_for_generation, + shuffle_dataset = shuffle_dataset, + generation_batch_size = generation_batch_size, + steps_per_generation = steps_per_generation, + temperature = temperature, + top_p = top_p, + top_k = top_k, + min_p = min_p, + generation_kwargs = generation_kwargs, + repetition_penalty = repetition_penalty, + use_transformers_paged = use_transformers_paged, + cache_implementation = cache_implementation, + use_vllm = use_vllm, + vllm_mode = vllm_mode, + vllm_model_impl = vllm_model_impl, + vllm_guided_decoding_regex = vllm_guided_decoding_regex, + vllm_server_base_url = vllm_server_base_url, + vllm_server_host = vllm_server_host, + vllm_server_port = vllm_server_port, + vllm_server_timeout = vllm_server_timeout, + vllm_gpu_memory_utilization = vllm_gpu_memory_utilization, + 
vllm_tensor_parallel_size = vllm_tensor_parallel_size,
+            beta = beta,
+            num_iterations = num_iterations,
+            epsilon = epsilon,
+            epsilon_high = epsilon_high,
+            reward_weights = reward_weights,
+            normalize_advantages = normalize_advantages,
+            reward_clip_range = reward_clip_range,
+            mask_truncated_completions = mask_truncated_completions,
+            sync_ref_model = sync_ref_model,
+            ref_model_mixup_alpha = ref_model_mixup_alpha,
+            ref_model_sync_steps = ref_model_sync_steps,
+            log_completions = log_completions,
+            num_completions_to_print = num_completions_to_print,
+            wandb_log_unique_prompts = wandb_log_unique_prompts,
+            rloo_k = rloo_k,
+            cliprange = cliprange,
+            kl_coef = kl_coef,
+            exp_name = exp_name,
+            normalize_reward = normalize_reward,
+            num_ppo_epochs = num_ppo_epochs,
+            num_mini_batches = num_mini_batches,
+            total_episodes = total_episodes,
+            response_length = response_length,
+            token_level_kl = token_level_kl,
+            dataset_num_proc = dataset_num_proc,
+            local_rollout_forward_batch_size = local_rollout_forward_batch_size,
+            num_sample_generations = num_sample_generations,
+            stop_token = stop_token,
+            stop_token_id = stop_token_id,
+            missing_eos_penalty = missing_eos_penalty,
+            **kwargs,
+        )
+        self.vllm_sampling_params = vllm_sampling_params
+        self.unsloth_num_chunks = unsloth_num_chunks
+
+pass
+
+class _UnslothRLOOTrainer(Trainer):
+    """
+    Trainer for the REINFORCE Leave-One-Out (RLOO) method. This algorithm was initially proposed in the paper
+    [Back to Basics: Revisiting REINFORCE Style Optimization for Learning from Human Feedback in
+    LLMs](https://huggingface.co/papers/2402.14740).
+
+    Example:
+
+    ```python
+    from datasets import load_dataset
+    from trl import RLOOTrainer
+
+    dataset = load_dataset("trl-lib/tldr", split="train")
+
+    def reward_func(completions, **kwargs):
+        # Dummy reward function that rewards completions with more unique letters.
+        return [float(len(set(completion))) for completion in completions]
+
+    trainer = RLOOTrainer(
+        model="Qwen/Qwen2-0.5B-Instruct",
+        reward_funcs=reward_func,
+        train_dataset=dataset,
+    )
+
+    trainer.train()
+    ```
+
+    Args:
+        model (`Union[str, PreTrainedModel]`):
+            Model to be trained. Can be either:
+
+            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
+              path to a *directory* containing model weights saved using
+              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+              using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
+              `args.model_init_kwargs`.
+            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
+        reward_funcs (`Union[RewardFunc, list[RewardFunc]]`):
+            Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
+            functions with the prompts and completions and sum the rewards. Can be either:
+
+            - A single reward function, such as:
+                - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
+                  path to a *directory* containing model weights saved using
+                  [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is
+                  loaded using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with
+                  `num_labels=1` and the keyword arguments in `args.model_init_kwargs`.
+                - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
+                - A custom reward function: The function is provided with the prompts and the generated completions,
+                  plus any additional columns in the dataset. It should return a list of rewards. Custom reward
+                  functions can also return `None` when the reward is not applicable to those samples. This is useful
+                  for multi-task training where different reward functions apply to different types of samples. When a
+                  reward function returns `None` for a sample, that reward function is excluded from the reward
+                  calculation for that sample. For more details, see [Using a custom reward
+                  function](#using-a-custom-reward-function).
+
+                  The trainer's state is also passed to the reward function: it is an instance of
+                  [`~transformers.TrainerState`] and is exposed through the `trainer_state` argument in the reward
+                  function's signature.
+            - A list of reward functions, where each item can independently be any of the above types. Mixing different
+              types within the list (e.g., a string model ID and a custom reward function) is allowed.
+        args ([`RLOOConfig`], *optional*, defaults to `None`):
+            Configuration for this trainer. If `None`, a default configuration is used.
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset
+            are ignored. The format of the samples can be either:
+
+            - [Standard](dataset_formats#standard): Each sample contains plain text.
+            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+              and content).
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
+            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`] or `None`, *optional*, defaults to `None`):
+            Processing class used to process the data. The padding side must be set to "left". If `None`, the
+            processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A
+            padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token,
+            `tokenizer.eos_token` will be used as the default.
+        reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
+            Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
+
+            - A single processing class: Used when `reward_funcs` contains only one reward function.
+            - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
+            If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
+            `None`, the tokenizer for the model is automatically loaded using
+            [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward
+            functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes`
+            are ignored.
+        callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
+            List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
+            [here](https://huggingface.co/docs/transformers/main_classes/callback).
+ + If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`] + method. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): + A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your + model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`): + PEFT configuration used to wrap the model. If `None`, the model is not wrapped. + """ + + _tag_names = ["trl", "rloo"] + + def __init__( + self, + # Note for dev: we can remove the default None when we remove the deprecated model parameter in version 0.25.0 + model: Union[str, PreTrainedModel] = None, + reward_funcs: Union[RewardFunc, list[RewardFunc]] = None, + args: Optional[RLOOConfig] = None, + train_dataset: Optional[Union[Dataset, IterableDataset]] = None, + eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None, + processing_class: Optional[Union[PreTrainedTokenizerBase, ProcessorMixin]] = None, + reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), + peft_config: Optional["PeftConfig"] = None, + # Deprecated parameters + config=None, + reward_model=None, + policy=None, + ref_policy=None, + data_collator=None, + ): + + if hasattr(model, 'vllm_engine') and hasattr(args, 'use_vllm'): + if (getattr(args, 'use_vllm', False) == False): + args.use_vllm = True + # Handle deprecated parameters + if config is not None: + warnings.warn( + "Parameter 'config' is deprecated and will be removed in version 0.25.0. Please use 'args' instead. " + "We are setting args=config" + ) + if args is None: + args = config + else: + raise ValueError("Cannot specify both 'config' (deprecated) and 'args'. Please use 'args' only.") + + if reward_model is not None: + warnings.warn( + "Parameter 'reward_model' is deprecated and will be removed in version 0.25.0. Please use " + "'reward_funcs' instead. We are setting reward_funcs=reward_model" + ) + if reward_funcs is None: + reward_funcs = reward_model + else: + raise ValueError( + "Cannot specify both 'reward_model' (deprecated) and 'reward_funcs'. Please use 'reward_funcs' " + "only." + ) + if policy is not None: + warnings.warn( + "Parameter 'policy' is deprecated and will be removed in version 0.25.0. Please use 'model' instead. " + "We are setting model=policy" + ) + if model is None: + model = policy + else: + raise ValueError("Cannot specify both 'policy' (deprecated) and 'model'. Please use 'model' only.") + if ref_policy is not None: + warnings.warn( + "Parameter 'ref_policy' is deprecated and will be removed in version 0.25.0. To use the initial model " + "as the reference model, simply omit this parameter. The parameter is ignored." + ) + if data_collator is not None: + warnings.warn( + "Parameter 'data_collator' is deprecated and will be removed in version 0.25.0. The RLOOTrainer does " + "not use a data collator, so this parameter is ignored." + ) + if "input_ids" in train_dataset.column_names: + warnings.warn( + "The training dataset contains a column named 'input_ids', indicating that it is pre-tokenized. 
" + "Support for pre-tokenized datasets is deprecated and will be removed in version 0.25. Please provide " + "the raw dataset (conversational or standard) with a 'prompt' column instead." + ) + + def decode(example, tokenizer): + return {"prompt": tokenizer.decode(example["input_ids"])} + + train_dataset = train_dataset.map(decode, fn_kwargs={"tokenizer": processing_class}) + if eval_dataset is not None and "input_ids" in eval_dataset.column_names: + warnings.warn( + "The evaluation dataset contains a column named 'input_ids', indicating that it is pre-tokenized. " + "Support for pre-tokenized datasets is deprecated and will be removed in version 0.25. Please provide " + "the raw dataset (conversational or standard) with a 'prompt' column instead." + ) + + def decode(example, tokenizer): + return {"prompt": tokenizer.decode(example["input_ids"])} + + eval_dataset = eval_dataset.map(decode, fn_kwargs={"tokenizer": processing_class}) + + # Args + if args is None: + model_name = model if isinstance(model, str) else model.config._name_or_path + model_name = model_name.split("/")[-1] + args = RLOOConfig(f"{model_name}-RLOO") + + # Models + # Trained model + model_init_kwargs = args.model_init_kwargs or {} + if isinstance(model, str): + model_id = model + dtype = model_init_kwargs.get("dtype") + if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None: + pass # dtype is already a torch.dtype or "auto" or None + elif isinstance(dtype, str): # it's a str, but not "auto" + dtype = getattr(torch, dtype) + model_init_kwargs["dtype"] = dtype + else: + raise ValueError( + "Invalid `dtype` passed to `RLOOConfig`. Expected either 'auto' or a string representing " + f"a `torch.dtype` (e.g., 'float32'), but got {dtype}." + ) + # Disable caching if gradient checkpointing is enabled [not supported] + config = AutoConfig.from_pretrained(model_id) + architecture = getattr(transformers, config.architectures[0]) + model = architecture.from_pretrained(model_id, **model_init_kwargs) + else: + model_id = model.config._name_or_path + if args.model_init_kwargs is not None: + logger.warning( + "You passed `model_init_kwargs` to the `RLOOConfig`, but your model is already instantiated. " + "The `model_init_kwargs` will be ignored." 
+ ) + + # Some models [SmolVLM/Idefics3] don't support `logits_to_keep` argument and error out if we pass it + # Inspect the forward method before we wrap the model with PEFT + self.model_kwarg_keys = ( + inspect.signature(model.forward).parameters.keys() + if not hasattr(model, "get_base_model") + else inspect.signature(model.get_base_model().forward).parameters.keys() + ) + + if False: + model = prepare_peft_model(model, peft_config, args) + + # Processing class + if processing_class is None: + processing_class = AutoProcessor.from_pretrained(model.config._name_or_path) + + # Handle pad token for processors or tokenizers + if isinstance(processing_class, ProcessorMixin): + tokenizer = processing_class.tokenizer + elif isinstance(processing_class, PreTrainedTokenizerBase): + tokenizer = processing_class + else: + raise TypeError("The `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`") + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + self.pad_token = tokenizer.pad_token + self.pad_token_id = tokenizer.pad_token_id + self.eos_token_id = tokenizer.eos_token_id + + # Reward functions + if not isinstance(reward_funcs, list): + reward_funcs = [reward_funcs] + self.reward_func_names = [] + for i, reward_func in enumerate(reward_funcs): + if isinstance(reward_func, str): + reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained( + reward_func, num_labels=1, **model_init_kwargs + ) + if isinstance(reward_funcs[i], nn.Module): # Use Module over PretrainedModel for compat w/ compiled models + self.reward_func_names.append(reward_funcs[i].config._name_or_path.split("/")[-1]) + else: + self.reward_func_names.append(reward_funcs[i].__name__) + self.reward_funcs = reward_funcs + + # Reward weights + if args.reward_weights is not None: + if len(args.reward_weights) != len(reward_funcs): + raise ValueError( + f"Number of reward weights ({len(args.reward_weights)}) must match number of reward " + f"functions ({len(reward_funcs)})" + ) + self.reward_weights = torch.tensor(args.reward_weights, dtype=torch.float32) + else: + self.reward_weights = torch.ones(len(reward_funcs), dtype=torch.float32) + + # Reward processing class + if reward_processing_classes is None: + reward_processing_classes = [None] * len(reward_funcs) + elif not isinstance(reward_processing_classes, list): + reward_processing_classes = [reward_processing_classes] + if len(reward_processing_classes) != len(reward_funcs): + raise ValueError( + f"The number of reward processing classes ({len(reward_processing_classes)}) must match the number of " + f"reward functions ({len(reward_funcs)})." + ) + + for i, (reward_processing_class, reward_func) in enumerate(zip(reward_processing_classes, reward_funcs)): + if isinstance(reward_func, PreTrainedModel): + if reward_processing_class is None: + reward_processing_class = AutoTokenizer.from_pretrained(reward_func.config._name_or_path) + if reward_processing_class.pad_token_id is None: + reward_processing_class.pad_token = reward_processing_class.eos_token + # The reward model computes the reward for the latest non-padded token in the input sequence. + # So it's important to set the pad token ID to the padding token ID of the processing class. 
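+                    # Concretely, the sequence-classification head scores the hidden state of the last
+                    # non-padded position, so a mismatched pad token id would silently shift which token
+                    # gets scored.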
+                    reward_func.config.pad_token_id = reward_processing_class.pad_token_id
+                    reward_processing_classes[i] = reward_processing_class
+
+        self.reward_processing_classes = reward_processing_classes
+
+        # Training arguments
+        self.max_prompt_length = args.max_prompt_length
+        self.max_completion_length = args.max_completion_length
+        self.num_generations = args.num_generations
+        self.temperature = args.temperature
+        self.top_p = args.top_p
+        self.top_k = args.top_k
+        self.min_p = args.min_p
+        self.repetition_penalty = args.repetition_penalty
+        self.use_transformers_paged = args.use_transformers_paged
+        self.use_vllm = args.use_vllm
+        self.vllm_mode = args.vllm_mode
+        self.vllm_gpu_memory_utilization = args.vllm_gpu_memory_utilization  # only applies to colocation mode
+        self.vllm_tensor_parallel_size = args.vllm_tensor_parallel_size  # only applies to colocation mode
+        self.normalize_advantages = args.normalize_advantages
+        self.mask_truncated_completions = args.mask_truncated_completions
+        self.reward_clip_range = args.reward_clip_range
+
+        # Datasets
+        self.shuffle_dataset = args.shuffle_dataset
+
+        if (
+            isinstance(train_dataset, IterableDataset)
+            or isinstance(eval_dataset, IterableDataset)
+            or (
+                isinstance(eval_dataset, dict) and any(isinstance(ds, IterableDataset) for ds in eval_dataset.values())
+            )
+        ):
+            # See https://github.com/huggingface/trl/issues/3213
+            raise NotImplementedError(
+                "Iterable datasets are not yet supported in RLOOTrainer. Please use a standard dataset instead."
+            )
+
+        # Multi-step
+        self.num_iterations = args.num_iterations
+        self.epsilon_low = args.epsilon
+        self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon
+        # Tracks the number of iterations (forward + backward passes), including those within a grad accum cycle
+        self._step = 0
+        # Buffer the batch to reuse generated outputs across multiple updates. For more details, see
+        # `_get_train_sampler` and `_prepare_inputs`.
+        self._buffered_inputs = None
+
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in RLOO, the sampled data does not include the
+        # "input_ids" key. Instead, the only available key is "prompt". As a result, the trainer issues the warning:
+        # "Could not estimate the number of tokens of the input, floating-point operations will not be computed." To
+        # suppress this warning, we set the "estimate_tokens" key in the model's "warnings_issued" dictionary to True.
+        # This acts as a flag to indicate that the warning has already been issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=identity,  # No data collation is needed in RLOO
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            callbacks=callbacks,
+            optimizers=optimizers,
+        )
+
+        # Reference model
+        self.beta = args.beta
+        if self.beta == 0.0:
+            # If beta is 0.0, the reference model is not needed
+            self.ref_model = None
+        elif is_peft_model(model):
+            # If PEFT is used, the reference model is not needed since the adapter can be disabled
+            # to revert to the initial model.
+ self.ref_model = None + else: + # For deepspeed, fsdp or non-distributed models, create a reference model from scratch + config = AutoConfig.from_pretrained(model_id) + architecture = getattr(transformers, config.architectures[0]) + self.ref_model = architecture.from_pretrained(model_id, **model_init_kwargs) + + # Disable dropout in the models + if args.disable_dropout: + disable_dropout_in_model(model) + if self.ref_model is not None: + disable_dropout_in_model(self.ref_model) + + # Initialize the metrics + self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)} + self._total_train_tokens = 0 + self.log_completions = args.log_completions + self.wandb_log_unique_prompts = args.wandb_log_unique_prompts + self.num_completions_to_print = args.num_completions_to_print + # Keep logs sized to the generation batch to record only outputs from the latest model update. + self._logs = { + "prompt": deque(maxlen=args.generation_batch_size), + "completion": deque(maxlen=args.generation_batch_size), + "rewards": defaultdict(lambda: deque(maxlen=args.generation_batch_size)), + "advantages": deque(maxlen=args.generation_batch_size), + } + + # Ensure each process receives a unique seed to prevent duplicate completions when generating with + # transformers if num_generations exceeds per_device_train_batch_size. We could skip it if we use vLLM, but + # it's safer to set it in all cases. + set_seed(args.seed, device_specific=True) + + if self.use_vllm: + if not is_vllm_available(): + raise ImportError( + "vLLM is not available and `use_vllm` is set to True. Please install vLLM with " + "`pip install vllm` to use it." + ) + + if self.vllm_mode == "server": + if self.accelerator.is_main_process: + if args.vllm_server_base_url is not None: + base_url = args.vllm_server_base_url + else: + base_url = f"http://{args.vllm_server_host}:{args.vllm_server_port}" + self.vllm_client = VLLMClient(base_url=base_url, connection_timeout=args.vllm_server_timeout) + self.vllm_client.init_communicator(device=torch.cuda.current_device()) + + elif self.vllm_mode == "colocate": + if not self.accelerator.num_processes % self.vllm_tensor_parallel_size == 0: + raise ValueError( + f"vllm_tensor_parallel_size ({self.vllm_tensor_parallel_size}) must divide world size " + f"({self.accelerator.num_processes}) evenly." 
+ ) + + if self.vllm_tensor_parallel_size > 1: + self.tp_group, _ = torch.distributed.new_subgroups_by_enumeration( + [ + list(range(i * self.vllm_tensor_parallel_size, (i + 1) * self.vllm_tensor_parallel_size)) + for i in range(self.accelerator.num_processes // self.vllm_tensor_parallel_size) + ] + ) + os.environ["RANK"] = str(self.accelerator.process_index) + os.environ["LOCAL_RANK"] = str(self.accelerator.local_process_index) + os.environ["WORLD_SIZE"] = str(self.accelerator.num_processes) + os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "localhost") + os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "12345") + + if self.max_prompt_length is not None and self.max_completion_length is not None: + max_model_len = self.max_prompt_length + self.max_completion_length + else: + max_model_len = None + self.llm = model.vllm_engine + else: + raise ValueError(f"vllm_mode must be either 'server' or 'colocate', got '{self.vllm_mode}'.") + self.guided_decoding_regex = args.vllm_guided_decoding_regex + + self._last_loaded_step = -1 + self.accelerator.wait_for_everyone() + else: + generation_kwargs = { + "max_new_tokens": self.max_completion_length, + "do_sample": True, + "pad_token_id": tokenizer.pad_token_id, + "bos_token_id": tokenizer.bos_token_id, + "eos_token_id": tokenizer.eos_token_id, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + "min_p": self.min_p, + "repetition_penalty": self.repetition_penalty, + "cache_implementation": args.cache_implementation, + } + if args.use_transformers_paged: + generation_kwargs["max_batch_tokens"] = 512 + generation_kwargs["num_blocks"] = 1024 + generation_kwargs["block_size"] = 128 + if args.generation_kwargs is not None: + generation_kwargs.update(args.generation_kwargs) + self.generation_config = GenerationConfig(**generation_kwargs) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. + self.model_accepts_loss_kwargs = False + + # Add tags to the model + self.model.add_model_tags(self._tag_names) + + if self.ref_model is not None: + if self.is_deepspeed_enabled: + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) + elif self.is_fsdp_enabled: + self.ref_model = prepare_fsdp(self.ref_model, self.accelerator) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + + if args.sync_ref_model: + self.add_callback(SyncRefModelCallback(ref_model=self.ref_model, accelerator=self.accelerator)) + + for i, reward_func in enumerate(self.reward_funcs): + if isinstance(reward_func, PreTrainedModel): + if self.is_deepspeed_enabled: + self.reward_funcs[i] = prepare_deepspeed(reward_func, self.accelerator) + else: + # set device placement to True to make `prepare_model` move `reward_func` to device when using fsdp + self.reward_funcs[i] = self.accelerator.prepare_model( + reward_func, evaluation_mode=True, device_placement=True + ) + + def _set_signature_columns_if_needed(self): + # If `self.args.remove_unused_columns` is True, non-signature columns are removed. + # By default, this method sets `self._signature_columns` to the model's expected inputs. + # In RLOOTrainer, we preprocess data, so using the model's signature columns doesn't work. 
+        # Instead, we set them to the columns expected by the `training_step` method, hence the override.
+        if self._signature_columns is None:
+            self._signature_columns = ["prompt"]
+
+    # This method overrides `Trainer.get_train_dataloader` to support our custom batching strategy.
+    # Instead of returning a standard per-step batch (i.e., `per_device_batch_size`), our dataloader loads a
+    # *generation* batch (i.e., `per_device_batch_size × steps_per_generation`). This allows us to generate completions
+    # once every `steps_per_generation` steps, rather than once per accumulation step, which is significantly more
+    # efficient. The only change from the original implementation is multiplying the batch size by
+    # `steps_per_generation`. Thus, `_prepare_inputs` is called with this *generation* batch, and it handles the
+    # splitting internally.
+    # Maintenance note: This method is a copy-paste of the original `Trainer.get_train_dataloader` with only one line
+    # modification. As a result, some parts of the method aren't relevant to RLOO, but we keep them to stay one line
+    # apart from the super method, ensuring easier maintenance in the future.
+    def get_train_dataloader(self):
+        if self.train_dataset is None:
+            raise ValueError("Trainer: training requires a train_dataset.")
+
+        train_dataset = self.train_dataset
+        data_collator = self.data_collator
+        if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
+            train_dataset = self._remove_unused_columns(train_dataset, description="training")
+        else:
+            data_collator = self._get_collator_with_removed_columns(data_collator, description="training")
+
+        dataloader_params = {
+            "batch_size": self._train_batch_size * self.args.steps_per_generation,  # < this is the change
+            "collate_fn": data_collator,
+            "num_workers": self.args.dataloader_num_workers,
+            "pin_memory": self.args.dataloader_pin_memory,
+            "persistent_workers": self.args.dataloader_persistent_workers,
+        }
+
+        if not isinstance(train_dataset, torch.utils.data.IterableDataset):
+            dataloader_params["sampler"] = self._get_train_sampler()
+            dataloader_params["drop_last"] = self.args.dataloader_drop_last
+            dataloader_params["worker_init_fn"] = partial(
+                seed_worker, num_workers=self.args.dataloader_num_workers, rank=self.args.process_index
+            )
+
+            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
+
+        return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))
+
+    def _get_train_sampler(self, dataset: Optional[Dataset] = None) -> Sampler:
+        # Returns a sampler that
+        # 1. ensures each prompt is repeated across multiple processes. This guarantees that identical prompts are
+        #    distributed to different GPUs, allowing rewards to be computed and normalized correctly within each prompt
+        #    group. Using the same seed across processes ensures consistent prompt assignment, preventing discrepancies
+        #    in group formation.
+        # 2. repeats the batch multiple times to allow reusing generations across multiple updates. Refer to
+        #    _prepare_inputs to see how the generations are stored and reused.
+
+        # In the following figure, the values are the prompt indices. The first row shows the first sampled batch, the
+        # second row shows the second sampled batch, and so on.
+ # + # | GPU 0 | GPU 1 | + # + # global_step step <-───> num_generations=2 + # <-───────> per_device_train_batch_size=3 + # grad_accum ▲ ▲ 0 0 0 0 1 1 2 2 <- Generate for the first `steps_per_generation` (prompts 0 to 11); store the completions; use the first slice to compute the loss + # =2 ▼ | 0 1 3 3 4 4 5 5 <- Take the stored generations and use the second slice to compute the loss + # | + # | 1 2 6 6 7 7 8 8 <- Take the stored generations and use the third slice to compute the loss + # steps_per_gen=4 ▼ 1 3 9 9 10 10 11 11 <- Take the stored generations and use the fourth slice to compute the loss + # + # 2 4 12 12 13 13 14 14 <- Generate for the second `steps_per_generation` (prompts 12 to 23); store the completions; use the first slice to compute the loss + # 2 5 15 15 16 16 17 17 <- Take the stored generations and use the second slice to compute the loss + # ... + if dataset is None: + dataset = self.train_dataset + return RepeatSampler( + data_source=dataset, + mini_repeat_count=self.num_generations, + batch_size=self.args.generation_batch_size // self.num_generations, + repeat_count=self.num_iterations * self.args.steps_per_generation, + shuffle=self.shuffle_dataset, + seed=self.args.seed, + ) + + def _get_eval_sampler(self, eval_dataset) -> Sampler: + # See _get_train_sampler for an explanation of the sampler. + return RepeatSampler( + data_source=eval_dataset, + mini_repeat_count=self.num_generations, + seed=self.args.seed, + ) + + @profiling_decorator + def _get_per_token_logps_and_entropies( + self, + model, + input_ids, + attention_mask, + logits_to_keep, + batch_size=None, + compute_entropy=False, + ) -> dict[str, Optional[torch.Tensor]]: + """Compute log-probs and (optionally) entropies for each token.""" + batch_size = batch_size or input_ids.size(0) # Chunk inputs into smaller batches to reduce memory peak + all_logps = [] + all_entropies = [] + for start in range(0, input_ids.size(0), batch_size): + input_ids_batch = input_ids[start : start + batch_size] + attention_mask_batch = attention_mask[start : start + batch_size] + + # Build model inputs - check if the model supports logits_to_keep (some models and VLMs don't) + model_inputs = {"input_ids": input_ids_batch, "attention_mask": attention_mask_batch} + + # Only add logits_to_keep if the model supports it + if "logits_to_keep" in self.model_kwarg_keys: + # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded + model_inputs["logits_to_keep"] = logits_to_keep + 1 + + model_inputs["use_cache"] = False # only used in generation; set False to suppress warnings + + logits = model(**model_inputs).logits + # Exclude the last value: it corresponds to the next token pred + logits = logits[:, :-1, :] # (B, L-1, H) + # Only keep the last logits_to_keep. For model that support logits_to_keep, this is a no-op. + logits = logits[:, -logits_to_keep:, :] # (B, logits_to_keep, H) + # Divide logits by sampling temperature. 
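+            # Completions were sampled from softmax(logits / T), so the log-probs must be computed on the
+            # same temperature-scaled logits to match the distribution that actually generated the tokens.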
+ # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + logits = logits / self.temperature + + completion_ids = input_ids_batch[:, -logits_to_keep:] + logps = selective_log_softmax(logits, completion_ids) # compute logprobs + all_logps.append(logps) + + if compute_entropy: + with torch.no_grad(): + entropies = entropy_from_logits(logits) + all_entropies.append(entropies) + + logps = torch.cat(all_logps, dim=0) + entropies = torch.cat(all_entropies, dim=0) if compute_entropy else None + return logps, entropies + + def _fix_param_name_to_vllm(self, name, extra_prefixes: Optional[list[str]] = None): + extra_prefixes = extra_prefixes or [] + prefixes = ["_checkpoint_wrapped_module."] + extra_prefixes + for prefix in prefixes: + name = name.replace(prefix, "") + return name + + def _sync_fsdp1_params_to_vllm(self, module: nn.Module, prefix: str = "", visited=None): + """Memory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM.""" + # For FSDP1, we need to recurse into children and also use summon_full_params + if visited is None: + visited = set() + for child_name, child_module in module.named_children(): + child_prefix = f"{prefix}.{child_name}" if prefix else child_name + self._sync_fsdp1_params_to_vllm( + child_module, prefix=child_prefix, visited=visited + ) # recurse into the child + + if isinstance(module, FSDP): + with FSDP.summon_full_params(module, recurse=False, writeback=False): + for param_name, param in module.named_parameters(): + full_name = f"{prefix}.{param_name}" if prefix else param_name + full_name = self._fix_param_name_to_vllm(full_name, extra_prefixes=["_fsdp_wrapped_module."]) + + if full_name in visited: + continue # skip FSDP subtrees already traversed + visited.add(full_name) + + if self.vllm_mode == "server" and self.accelerator.is_main_process: + self.vllm_client.update_named_param(full_name, param.data) + elif self.vllm_mode == "colocate": + + pass + + pass + + def _sync_fsdp2_params_to_vllm(self, module: nn.Module): + # For FSDP2, module already covers all parameters, so no need for recursion + for name, param in module.items(): + if param.is_cpu: + param = param.to(torch.device("cuda")) + param = param.full_tensor() + + if self.vllm_mode == "server" and self.accelerator.is_main_process: + self.vllm_client.update_named_param(name, param) + elif self.vllm_mode == "colocate": + + pass + + pass + + @profiling_decorator + def _move_model_to_vllm(self): + # For DeepSpeed ZeRO-3 and FSDP, we need to gather all parameters before operations + deepspeed_plugin = self.accelerator.state.deepspeed_plugin + zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3 + if zero_stage_3: + import deepspeed + + gather_if_zero3 = deepspeed.zero.GatheredParameters + else: + gather_if_zero3 = nullcontext + + if is_peft_model(self.model): + # With PEFT and FSDP/DeepSpeed ZeRO Stage 3, we must gather the full model at once before merging, as + # merging adapters in a sharded manner is not supported. + # TODO: does this work with FSDP? 
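+            # Merging folds each LoRA delta into its base weight (conceptually W <- W + scaling * B @ A),
+            # so vLLM receives plain dense tensors and needs no adapter-specific handling.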
+ with gather_if_zero3(list(self.model.parameters())): + self.model.merge_adapter() + + # Update vLLM weights while parameters are gathered + if self.is_fsdp_enabled: # note if using FSDP, gather_if_zero3 is nullcontext + # Update vLLM weights while parameters are gathered + # For PEFT with FSDP we need to use the memory efficient post-order traversal + fsdp_plugin = getattr(self.accelerator.state, "fsdp_plugin", None) + fsdp_version = getattr(fsdp_plugin, "fsdp_version", 1) if fsdp_plugin else 1 + if fsdp_version == 1: + self._sync_fsdp1_params_to_vllm( + self.model + ) # use memory-efficient post-order traversal for FSDP + elif fsdp_version == 2: + self._sync_fsdp2_params_to_vllm(self.model) + else: + # DeepSpeed ZeRO-3 with PEFT + for name, param in self.model.named_parameters(): + # When using PEFT, we need to recover the original parameter name and discard some parameters + name = name.removeprefix("base_model.model.").replace(".base_layer", "") + if self.model.prefix in name: + continue + # When module to save, remove its prefix and discard the original module + if "original_module" in name: + continue + name = self._fix_param_name_to_vllm(name, extra_prefixes=["modules_to_save.default."]) + + if self.vllm_mode == "server" and self.accelerator.is_main_process: + self.vllm_client.update_named_param(name, param.data) + elif self.vllm_mode == "colocate": + + pass + + pass + # Unmerge adapters while parameters are still gathered + self.model.unmerge_adapter() + # Parameters will automatically be repartitioned when exiting the context + else: + # For non-PEFT models, simply gather (if needed) and update each parameter individually. + if self.is_fsdp_enabled: + fsdp_plugin = getattr(self.accelerator.state, "fsdp_plugin", None) + fsdp_version = getattr(fsdp_plugin, "fsdp_version", 1) if fsdp_plugin else 1 + if fsdp_version == 1: + self._sync_fsdp1_params_to_vllm(self.model) # use memory-efficient post-order traversal for FSDP + elif fsdp_version == 2: + self._sync_fsdp2_params_to_vllm(self.model) + else: + for name, param in self.model.named_parameters(): + name = self._fix_param_name_to_vllm(name) + with gather_if_zero3([param]): + if self.vllm_mode == "server" and self.accelerator.is_main_process: + self.vllm_client.update_named_param(name, param.data) + elif self.vllm_mode == "colocate": + + pass + + pass + + # Reset cache on vLLM + if self.vllm_mode == "server" and self.accelerator.is_main_process: + self.vllm_client.reset_prefix_cache() + elif self.vllm_mode == "colocate": + self.llm.reset_prefix_cache() + + @profiling_decorator + def _prepare_inputs( + self, generation_batch: dict[str, Union[torch.Tensor, Any]] + ) -> dict[str, Union[torch.Tensor, Any]]: + # Prepares inputs for model training/evaluation by managing completion generation and batch handling. 
+        # During training:
+        # - Receives the local generation batch (Per-GPU batch size × steps per generation)
+        #   from the modified training dataloader instead of the standard local batch
+        # - Generates completions once for the entire generation batch and splits it into batches of size
+        #   `per_device_train_batch_size`
+        # - Buffers these completions and returns the appropriate slice for the current accumulation step
+        # - Optimizes by regenerating completions only periodically (every steps_per_generation * num_iterations)
+        # During evaluation:
+        # - The input is treated as a standard local batch (no accumulation, no multiple iterations)
+        # - Completions are generated for each batch without buffering or reuse
+        # Returns a single local batch in both cases.
+
+        mode = "train" if self.model.training else "eval"
+        if mode == "train":
+            generate_every = self.args.steps_per_generation * self.num_iterations
+            if self._step % generate_every == 0 or self._buffered_inputs is None:
+                # self._buffered_inputs=None can occur when resuming from a checkpoint
+                generation_batch = self._generate_and_score_completions(generation_batch)
+                generation_batch = split_pixel_values_by_grid(generation_batch)
+                try:
+                    generation_batch = shuffle_sequence_dict(generation_batch)
+                except Exception:
+                    pass
+                generation_batches = split_tensor_dict(generation_batch, self.args.steps_per_generation)
+                self._buffered_inputs = [unsplit_pixel_values_by_grid(batch) for batch in generation_batches]
+            inputs = self._buffered_inputs[self._step % self.args.steps_per_generation]
+            self._step += 1
+        else:
+            # In evaluation, there is neither batch grouping for generation, nor multiple iterations, hence
+            # local generation batch == local eval batch
+            inputs = self._generate_and_score_completions(generation_batch)
+        return inputs
+
+    @profiling_decorator
+    def _calculate_rewards(self, inputs, prompts, completions, completion_ids_list):
+        device = self.accelerator.device
+        rewards_per_func = torch.zeros(len(prompts), len(self.reward_funcs), device=device)
+
+        # Repeat all input columns (except "prompt", "completion", and "completion_ids") to match the number of
+        # generations
+        keys = [key for key in inputs[0] if key not in ["prompt", "completion", "completion_ids"]]
+        reward_kwargs = {key: [example[key] for example in inputs] for key in keys}
+
+        # This allows for dynamic reward shaping based on training progress.
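+        # A custom reward function could, for example, anneal a shaping term as training advances;
+        # a hypothetical sketch (TrainerState exposes `global_step` and `max_steps`):
+        #
+        #     def length_penalty(completions, trainer_state=None, **kwargs):
+        #         progress = trainer_state.global_step / max(trainer_state.max_steps, 1)
+        #         return [-progress * len(completion) for completion in completions]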
+        reward_kwargs["trainer_state"] = self.state
+
+        for i, (reward_func, reward_processing_class, reward_func_name) in enumerate(
+            zip(self.reward_funcs, self.reward_processing_classes, self.reward_func_names)
+        ):
+            with profiling_context(self, reward_func_name):
+                if isinstance(reward_func, nn.Module):  # nn.Module (rather than PreTrainedModel), for compatibility with compiled models
+                    if is_conversational(inputs[0]):
+                        messages = [{"messages": p + c} for p, c in zip(prompts, completions)]
+                        texts = [apply_chat_template(x, reward_processing_class)["text"] for x in messages]
+                    else:
+                        texts = [p + c for p, c in zip(prompts, completions)]
+                    reward_inputs = reward_processing_class(
+                        text=texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False
+                    )
+                    reward_inputs = super()._prepare_inputs(reward_inputs)
+                    with torch.inference_mode():
+                        rewards_per_func[:, i] = reward_func(**reward_inputs).logits[:, 0]  # Shape (B*G,)
+                else:
+                    output_reward_func = reward_func(
+                        prompts=prompts, completions=completions, completion_ids=completion_ids_list, **reward_kwargs
+                    )
+                    # Convert None values to NaN
+                    output_reward_func = [reward if reward is not None else torch.nan for reward in output_reward_func]
+
+                    rewards_per_func[:, i] = torch.tensor(output_reward_func, dtype=torch.float32, device=device)
+
+        # If all reward functions return None for a given row, issue a detailed warning
+        if torch.isnan(rewards_per_func).all(dim=1).any():
+            nan_row_idx = torch.isnan(rewards_per_func).all(dim=1).nonzero(as_tuple=True)[0][0]
+            row_reward_kwargs = {
+                key: value[nan_row_idx] for key, value in reward_kwargs.items() if key != "trainer_state"
+            }
+            row_reward_kwargs["prompt"] = prompts[nan_row_idx]
+            row_reward_kwargs["completion"] = completions[nan_row_idx]
+            logger.warning(
+                f"All reward functions returned None for the following kwargs:\n{row_reward_kwargs}\n"
+                "Please ensure that at least one reward function returns a valid reward."
+            )
+
+        # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the
+        # completions may be distributed across processes
+        rewards_per_func = gather(rewards_per_func)
+        return rewards_per_func
+
+    def _generate_and_score_completions(
+        self, inputs: list[dict[str, Union[torch.Tensor, Any]]]
+    ) -> dict[str, Union[torch.Tensor, Any]]:
+        device = self.accelerator.device
+        mode = "train" if self.model.training else "eval"
+
+        prompts = [x["prompt"] for x in inputs]
+
+        # We don't yet support visual reward models/functions, so we keep a copy of the original text-only prompts for
+        # later use in the reward computation. If images are present, we insert {"type": "image"} as required by the
+        # VLM chat template.
+        original_prompts = copy.deepcopy(prompts)
+
+        prompts_text = [maybe_apply_chat_template(example, self.processing_class)["prompt"] for example in inputs]
+
+        prompt_inputs = self.processing_class(
+            text=prompts_text,
+            return_tensors="pt",
+            padding=True,
+            padding_side="left",
+            add_special_tokens=False,
+        )
+        prompt_inputs = super()._prepare_inputs(prompt_inputs)
+        prompt_ids, prompt_mask = prompt_inputs["input_ids"], prompt_inputs["attention_mask"]
+
+        if self.max_prompt_length is not None:
+            # If max_prompt_length is set, we trim the prompt to keep only the last `max_prompt_length` tokens.
+            # Then we decode those tokens back into text. We manually remove leading pad tokens from the decoded text,
+            # because we can't use `skip_special_tokens=True` (some special tokens are still needed for generation).
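+            # Illustrative example (hypothetical tokens): with max_prompt_length=3, a left-padded prompt
+            # [pad, t1, t2, t3, t4] keeps only its last 3 tokens [t2, t3, t4], and the attention mask is
+            # sliced identically so the two stay aligned.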
+            prompt_ids, prompt_mask = truncate_with_protected_tokens(
+                prompt_ids, prompt_mask, self.max_prompt_length, protected_tokens=[]
+            )
+
+            prompts_text = self.processing_class.batch_decode(
+                prompt_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
+            )
+            prompts_text = [re.sub(rf"^({re.escape(self.pad_token)})+", "", text) for text in prompts_text]
+
+        # Generate completions using either vLLM or regular generation
+        if self.use_vllm:
+            # First, update the vLLM weights if needed
+            if self.state.global_step != self._last_loaded_step:
+                self._move_model_to_vllm()
+                self._last_loaded_step = self.state.global_step
+
+            # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
+            if self.vllm_mode == "server":
+                all_prompts_text = gather_object(prompts_text)
+
+                if self.accelerator.is_main_process:
+                    # Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and generate
+                    # num_generations outputs for each one. This is faster than generating outputs for each duplicate
+                    # prompt individually.
+                    ordered_set_of_prompts = all_prompts_text[:: self.num_generations]
+
+                    with profiling_context(self, "vLLM.generate"):
+                        completion_ids = self.vllm_client.generate(
+                            prompts=ordered_set_of_prompts,
+                            n=self.num_generations,
+                            repetition_penalty=self.repetition_penalty,
+                            temperature=self.temperature,
+                            top_p=self.top_p,
+                            top_k=-1 if self.top_k is None else self.top_k,
+                            min_p=0.0 if self.min_p is None else self.min_p,
+                            max_tokens=self.max_completion_length,
+                            guided_decoding_regex=self.guided_decoding_regex,
+                            generation_kwargs=self.args.generation_kwargs,
+                        )
+                else:
+                    completion_ids = [None] * len(all_prompts_text)
+                # Broadcast the completions from the main process to all processes, ensuring each process receives its
+                # corresponding slice.
+                completion_ids = broadcast_object_list(completion_ids, from_process=0)
+                process_slice = slice(
+                    self.accelerator.process_index * len(prompts),
+                    (self.accelerator.process_index + 1) * len(prompts),
+                )
+                completion_ids = completion_ids[process_slice]
+
+            # Generate completions using colocated vLLM instances: each device holds a vLLM copy and works on its
+            # own batch of prompts
+            elif self.vllm_mode == "colocate":
+                if self.guided_decoding_regex:
+                    guided_decoding = GuidedDecodingParams(regex=self.guided_decoding_regex)
+                else:
+                    guided_decoding = None
+
+                generation_kwargs = {
+                    "n": 1,  # in colocate mode, vLLM on each GPU generates only 1 completion per prompt
+                    "repetition_penalty": self.repetition_penalty,
+                    "temperature": self.temperature,
+                    "top_p": self.top_p,
+                    "top_k": -1 if self.top_k is None else self.top_k,
+                    "min_p": 0.0 if self.min_p is None else self.min_p,
+                    "max_tokens": self.max_completion_length,
+                    "guided_decoding": guided_decoding,
+                }
+                if self.args.generation_kwargs is not None:
+                    generation_kwargs.update(self.args.generation_kwargs)
+                sampling_params = SamplingParams(**generation_kwargs)
+
+                if self.vllm_tensor_parallel_size > 1:
+                    # Gather prompts from all ranks in the TP group and flatten.
+                    # Each rank starts with its own prompts; after gathering, all ranks see the full group set.
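+                    # e.g. with vllm_tensor_parallel_size=2 and 4 prompts per rank, both ranks end up with
+                    # the same 8 gathered prompts; each rank later slices its own 4 completions back out
+                    # via `tp_slice` below.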
+ orig_size = len(prompts_text) + gathered_prompts = [None for _ in range(self.vllm_tensor_parallel_size)] + torch.distributed.all_gather_object(gathered_prompts, prompts_text, group=self.tp_group) + all_prompts_text = [p for sublist in gathered_prompts for p in sublist] + + else: + all_prompts_text = prompts_text + + vllm_inputs = all_prompts_text + + with profiling_context(self, "vLLM.generate"): + all_outputs = self.llm.generate(vllm_inputs, sampling_params=sampling_params, use_tqdm=False, lora_request = self.model.load_lora('rloo_trainer_lora_model', load_tensors = True)) + + completion_ids = [output.token_ids for outputs in all_outputs for output in outputs.outputs] + + if self.vllm_tensor_parallel_size > 1: + # Slice completions for this rank within its TP group. + # Each rank generates all outputs — we keep only our share. + local_rank_in_group = torch.distributed.get_rank(group=self.tp_group) + tp_slice = slice(local_rank_in_group * orig_size, (local_rank_in_group + 1) * orig_size) + completion_ids = completion_ids[tp_slice] + + # Pad the completions, and concatenate them with the prompts + completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids] + completion_ids = pad(completion_ids, padding_value=self.pad_token_id) + prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1) + + elif self.use_transformers_paged: + # Re-process inputs for paged generation if needed + paged_prompt_inputs = self.processing_class(text=prompts_text) + previous_attn = self.model_wrapped.config._attn_implementation + + if is_flash_attn_2_available(): + self.model_wrapped.config._attn_implementation = "paged_attention" + else: + self.model_wrapped.config._attn_implementation = "sdpa_paged" + with ( + profiling_context(self, "transformers.generate_batch"), + unwrap_model_for_generation( + self.model_wrapped, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model, + torch.no_grad(), + FSDP.summon_full_params(self.model_wrapped, recurse=False) if self.is_fsdp_enabled else nullcontext(), + ): + # Cast to the appropriate dtype based on training configuration + if self.args.bf16: + unwrapped_model.to(torch.bfloat16) + elif self.args.fp16: + unwrapped_model.to(torch.float16) + with torch.inference_mode(): + all_outputs = unwrapped_model.generate_batch( + paged_prompt_inputs.input_ids, generation_config=self.generation_config, progress_bar=False + ) + completion_ids = [output.generated_tokens for output in all_outputs.values()] + completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids] + completion_ids = pad(completion_ids, padding_value=self.pad_token_id, padding_side="right") + prompt_ids = [torch.tensor(ids, device=device) for ids in paged_prompt_inputs.input_ids] + prompt_ids = pad(prompt_ids, padding_value=self.pad_token_id, padding_side="left") + prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1) + # Restore the original attention implementation, training mode + self.model_wrapped.config._attn_implementation = previous_attn + else: + # Regular generation path + with ( + profiling_context(self, "transformers.generate"), + unwrap_model_for_generation( + self.model_wrapped, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model, + torch.no_grad(), + FSDP.summon_full_params(self.model_wrapped, recurse=False) if self.is_fsdp_enabled else nullcontext(), + ): + prompt_inputs["input_ids"], prompt_inputs["attention_mask"] = prompt_ids, prompt_mask + 
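+                # `generate` returns the prompt and completion concatenated along the sequence dimension;
+                # the completion is split off just below using the prompt length.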
prompt_completion_ids = unwrapped_model.generate( + **prompt_inputs, generation_config=self.generation_config, disable_compile=True + ) + # Compute prompt length and extract completion ids + prompt_length = prompt_ids.size(1) + prompt_ids = prompt_completion_ids[:, :prompt_length] + completion_ids = prompt_completion_ids[:, prompt_length:] + + # Mask everything after the first EOS token + is_eos = completion_ids == self.eos_token_id + eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device) + eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)] + sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1) + completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int() + + # Convert tensor to a list of lists of token IDs. This will be passed to the reward function, avoiding the need + # to re-tokenize completions if the reward is computed from tokens. + completion_ids_list = [row[mask_row].tolist() for row, mask_row in zip(completion_ids, completion_mask.bool())] + + # Sum along sequence dimension (dim=1) to get completion length per sequence, used for logging + completion_lengths = completion_mask.sum(1) + + # If mask_truncated_completions is enabled, zero out truncated completions in completion_mask + if self.mask_truncated_completions: + truncated_completions = ~is_eos.any(dim=1) + completion_mask = completion_mask * (~truncated_completions).unsqueeze(1).int() + + # Concatenate prompt_mask with completion_mask for logit computation + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) # (B, P+C) + + logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens + batch_size = self.args.per_device_train_batch_size if mode == "train" else self.args.per_device_eval_batch_size + + with torch.no_grad(): + # Compute the per-token log probabilities for the current model + old_per_token_logps, _ = self._get_per_token_logps_and_entropies( + self.model, + prompt_completion_ids, + attention_mask, + logits_to_keep, + batch_size, + ) + old_logps = (old_per_token_logps * completion_mask).sum(1) # mask out padding and tokens after EOS + + # Compute the per-token log probabilities for the reference model + if self.beta != 0.0: + if self.ref_model is not None: + ref_per_token_logps, _ = self._get_per_token_logps_and_entropies( + self.ref_model, + prompt_completion_ids, + attention_mask, + logits_to_keep, + batch_size=batch_size, + ) + else: + with self.accelerator.unwrap_model(self.model).disable_adapter(): + ref_per_token_logps, _ = self._get_per_token_logps_and_entropies( + self.model, + prompt_completion_ids, + attention_mask, + logits_to_keep, + batch_size=batch_size, + ) + else: + ref_per_token_logps = None + + # Decode the generated completions + completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True) + if is_conversational(inputs[0]): + completions = [] + for prompt, completion in zip(prompts, completions_text): + bootstrap = prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else "" + completions.append([{"role": "assistant", "content": bootstrap + completion}]) + else: + completions = completions_text + + # Calculate rewards for each reward function. rewards_per_func aggregates rewards across all processes. This is + # important because rewards will be normalized per group, and completions are distributed. We will later slice + # rewards_per_func to extract each process's subset. 
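+        # Note: `_calculate_rewards` gathers across processes, so `rewards_per_func` below has one row per
+        # completion over *all* processes (num_prompts * num_generations rows in total).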
+ rewards_per_func = self._calculate_rewards(inputs, original_prompts, completions, completion_ids_list) + + # Apply weights to each reward function's output and sum + rewards = (rewards_per_func * self.reward_weights.to(device).unsqueeze(0)).nansum(dim=1) + + # Apply reward clipping if specified + if self.reward_clip_range: + rewards = rewards.clamp(min=self.reward_clip_range[0], max=self.reward_clip_range[1]) + + # Include the KL penalty in the reward + if self.beta != 0.0: + per_token_kl = old_per_token_logps - ref_per_token_logps + # Apply sequence-level KL penalty to rewards (sum KL across tokens first, then apply to each sequence) + kl = (per_token_kl * completion_mask).sum(-1) + kl = gather(kl) # rewards are gathered, so kl must be too + rewards = rewards - self.beta * kl + + grouped_rewards = rewards.view(-1, self.num_generations) + mean_grouped_rewards = grouped_rewards.mean(dim=1) + std_rewards = grouped_rewards.std(dim=1) + is_std_zero = torch.isclose(std_rewards, torch.zeros_like(std_rewards)) + + # RLOO advantages computation + grouped_sum = grouped_rewards.sum(dim=1, keepdim=True) # (num_prompts, 1) + baselines = (grouped_sum - grouped_rewards) / (self.num_generations - 1) # (num_prompts, num_generations) + baselines = baselines.view(-1) # Flatten back to match rewards shape + advantages = rewards - baselines + + # Normalize advantages + if self.normalize_advantages: + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-4) + + # Slice to keep only the local part of the data + process_slice = slice( + self.accelerator.process_index * len(prompts), + (self.accelerator.process_index + 1) * len(prompts), + ) + all_process_advantages = advantages.clone() # keep the aggregated advantages for logging + advantages = advantages[process_slice] + + # Log the metrics + if mode == "train": + self.state.num_input_tokens_seen += self.accelerator.gather(attention_mask.sum()).sum().item() + self._metrics[mode]["num_tokens"] = [self.state.num_input_tokens_seen] + + # Calculate and log the mean KL divergence between current and reference model + if self.beta != 0.0: + mean_kl = (per_token_kl * completion_mask).sum() / completion_mask.sum().clamp(min=1.0) + self._metrics[mode]["kl"].append(self.accelerator.gather(mean_kl).nanmean().item()) + + # Log completion lengths, mean, min, max + agg_completion_lengths = self.accelerator.gather(completion_lengths) + self._metrics[mode]["completions/mean_length"].append(agg_completion_lengths.float().mean().item()) + self._metrics[mode]["completions/min_length"].append(agg_completion_lengths.float().min().item()) + self._metrics[mode]["completions/max_length"].append(agg_completion_lengths.float().max().item()) + + # Identify sequences that terminated with EOS and log their lengths + agg_terminated_with_eos = self.accelerator.gather(is_eos.any(dim=1)) + term_completion_lengths = agg_completion_lengths[agg_terminated_with_eos] + clipped_completions_ratio = 1 - len(term_completion_lengths) / len(agg_completion_lengths) + self._metrics[mode]["completions/clipped_ratio"].append(clipped_completions_ratio) + if len(term_completion_lengths) == 0: # edge case where no terminated sequences are found + term_completion_lengths = torch.zeros(1, device=device) + self._metrics[mode]["completions/mean_terminated_length"].append(term_completion_lengths.float().mean().item()) + self._metrics[mode]["completions/min_terminated_length"].append(term_completion_lengths.float().min().item()) + 
self._metrics[mode]["completions/max_terminated_length"].append(term_completion_lengths.float().max().item()) + + # Calculate mean reward per function, but only for samples where the function was applied (non-NaN values) + for i, reward_func_name in enumerate(self.reward_func_names): + mean_rewards = torch.nanmean(rewards_per_func[:, i]).item() + self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards) + std_func_rewards = nanstd(rewards_per_func[:, i]).item() + self._metrics[mode][f"rewards/{reward_func_name}/std"].append(std_func_rewards) + self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item()) + self._metrics[mode]["reward_std"].append(std_rewards.mean().item()) + self._metrics[mode]["frac_reward_zero_std"].append(is_std_zero.float().mean().item()) + + # Log prompt and completion texts + self._logs["prompt"].extend(gather_object(prompts_text)) + self._logs["completion"].extend(gather_object(completions_text)) + for i, name in enumerate(self.reward_func_names): + self._logs["rewards"][name].extend(rewards_per_func[:, i].tolist()) + self._logs["advantages"].extend(all_process_advantages.tolist()) + + output = { + "prompt_ids": prompt_ids, + "prompt_mask": prompt_mask, + "completion_ids": completion_ids, + "completion_mask": completion_mask, + "old_logps": old_logps, + "advantages": advantages, + } + return output + + @profiling_decorator + def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): + if return_outputs: + raise ValueError("The RLOOTrainer does not support returning outputs") + return self._compute_loss(model, inputs) + + def _compute_loss(self, model, inputs): + # Compute the per-token log probabilities for the model + prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"] + completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"] + input_ids = torch.cat([prompt_ids, completion_ids], dim=1) + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) + logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens + + # Compute the per_token_logps and the entropy at each position in the completion + per_token_logps, entropies = self._get_per_token_logps_and_entropies( + model, + input_ids, + attention_mask, + logits_to_keep, + compute_entropy=True, + ) + logps = (per_token_logps * completion_mask).sum(1) # mask out padding and tokens after EOS + old_logps = inputs["old_logps"] + log_ratio = logps - old_logps + + # Compute the loss + advantages = inputs["advantages"] + coef_1 = torch.exp(log_ratio) + coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high) + per_sequence_loss1 = coef_1 * advantages + per_sequence_loss2 = coef_2 * advantages + per_sequence_loss = -torch.min(per_sequence_loss1, per_sequence_loss2) + loss = per_sequence_loss.mean() + + # Log the metrics + mode = "train" if self.model.training else "eval" + + # Entropy + mean_entropy = (entropies * completion_mask).sum() / completion_mask.sum().clamp(min=1.0) + self._metrics[mode]["entropy"].append(self.accelerator.gather(mean_entropy).nanmean().item()) + + # Compute the clipped probability ratios + is_low_clipped = (coef_1 < 1 - self.epsilon_low) & (advantages < 0) + is_high_clipped = (coef_1 > 1 + self.epsilon_high) & (advantages > 0) + is_region_clipped = is_low_clipped | is_high_clipped + gathered_low_clip = self.accelerator.gather(is_low_clipped.float().mean()) + 
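+        # Each gathered tensor holds one per-process mean; we log the average across processes plus the
+        # worst case (min for low clipping, max for high clipping).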
self._metrics[mode]["clip_ratio/low_mean"].append(gathered_low_clip.nanmean().item()) + self._metrics[mode]["clip_ratio/low_min"].append(nanmin(gathered_low_clip).item()) + gathered_high_clip = self.accelerator.gather(is_high_clipped.float().mean()) + self._metrics[mode]["clip_ratio/high_mean"].append(gathered_high_clip.nanmean().item()) + self._metrics[mode]["clip_ratio/high_max"].append(nanmax(gathered_high_clip).item()) + gathered_clip_ratio = self.accelerator.gather(is_region_clipped.float().mean()) + self._metrics[mode]["clip_ratio/region_mean"].append(gathered_clip_ratio.nanmean().item()) + return loss + + def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys: Optional[list[str]] = None): + inputs = self._prepare_inputs(inputs) + with torch.no_grad(): + with self.compute_loss_context_manager(): + loss = self.compute_loss(model, inputs) + loss = loss.mean().detach() + return loss, None, None + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + mode = "train" if self.model.training else "eval" + metrics = {key: sum(val) / len(val) for key, val in self._metrics[mode].items()} # average the metrics + + # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs` + # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format. + if mode == "eval": + metrics = {f"eval_{key}": val for key, val in metrics.items()} + + logs = {**logs, **metrics} + super().log(logs, start_time) + self._metrics[mode].clear() + + if self.accelerator.is_main_process and self.log_completions: + if is_rich_available(): + print_prompt_completions_sample( + self._logs["prompt"], + self._logs["completion"], + self._logs["rewards"], + self._logs["advantages"], + self.state.global_step, + self.num_completions_to_print, + ) + + if self.args.report_to and "wandb" in self.args.report_to and wandb.run is not None: + import pandas as pd + + table = { + "step": [str(self.state.global_step)] * len(self._logs["prompt"]), + "prompt": self._logs["prompt"], + "completion": self._logs["completion"], + **self._logs["rewards"], + "advantage": self._logs["advantages"], + } + + df = pd.DataFrame(table) + if self.wandb_log_unique_prompts: + df = df.drop_duplicates(subset=["prompt"]) + wandb.log({"completions": wandb.Table(dataframe=df)}) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
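+
+        Example (illustrative):
+
+        ```python
+        trainer.create_model_card(model_name="my-rloo-model", tags=["rloo", "trl"])
+        ```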
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + # docstyle-ignore + citation = textwrap.dedent( + """\ + @inproceedings{ahmadian2024back, + title = {{Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMs}}, + author = {Arash Ahmadian and Chris Cremer and Matthias Gall{\'{e}} and Marzieh Fadaee and Julia Kreutzer and Olivier Pietquin and Ahmet {\"{U}}st{\"{u}}n and Sara Hooker}, + year = 2024, + booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), {ACL} 2024, Bangkok, Thailand, August 11-16, 2024}, + pages = {12248--12267}, + publisher = {Association for Computational Linguistics}, + editor = {Lun{-}Wei Ku and Andre Martins and Vivek Srikumar}, + } + """ + ) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="RLOO", + trainer_citation=citation, + paper_title="Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMs", + paper_id="2402.14740", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothRLOOTrainer(_UnslothRLOOTrainer): + """ + +Trainer for the Reinforce Leave One Out (RLOO) method. This algorithm was initially proposed in the paper [Back to +Basics: Revisiting REINFORCE Style Optimization for Learning from Human Feedback in LLMs] +(https://huggingface.co/papers/2402.14740). + +Example: + +```python +from datasets import load_dataset +from trl import RLOOTrainer + +dataset = load_dataset("trl-lib/tldr", split="train") +def reward_func(completions, **kwargs): + # Dummy reward function that rewards completions with more unique letters. + return [float(len(set(completion))) for completion in completions] +trainer = RLOOTrainer( + model="Qwen/Qwen2-0.5B-Instruct", + reward_funcs=reward_func, + train_dataset=dataset, +) + +trainer.train() +``` + +Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in + `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. + reward_funcs (`Union[RewardFunc, list[RewardFunc]]`): + Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward + functions with the prompts and completions and sum the rewards. 
Can be either:
+
+            - A single reward function, such as:
+                - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
+                  path to a *directory* containing model weights saved using
+                  [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is
+                  loaded using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with
+                  `num_labels=1` and the keyword arguments in `args.model_init_kwargs`.
+                - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
+                - A custom reward function: The function is provided with the prompts and the generated completions,
+                  plus any additional columns in the dataset. It should return a list of rewards. Custom reward
+                  functions can also return `None` when the reward is not applicable to those samples. This is useful
+                  for multi-task training where different reward functions apply to different types of samples. When a
+                  reward function returns `None` for a sample, that reward function is excluded from the reward
+                  calculation for that sample. For more details, see [Using a custom reward
+                  function](#using-a-custom-reward-function).
+
+                  The trainer's state is also passed to the reward function. The trainer's state is an instance of
+                  [`~transformers.TrainerState`] and can be accessed via the `trainer_state` argument in the reward
+                  function's signature.
+            - A list of reward functions, where each item can independently be any of the above types. Mixing different
+              types within the list (e.g., a string model ID and a custom reward function) is allowed.
+        args ([`RLOOConfig`], *optional*, defaults to `None`):
+            Configuration for this trainer. If `None`, a default configuration is used.
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset
+            are ignored. The format of the samples can be either:
+
+            - [Standard](dataset_formats#standard): Each sample contains plain text.
+            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+              and content).
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
+            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`] or `None`, *optional*, defaults to `None`):
+            Processing class used to process the data. The padding side must be set to "left". If `None`, the
+            processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A
+            padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token,
+            `tokenizer.eos_token` will be used as the default.
+        reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
+            Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
+
+            - A single processing class: Used when `reward_funcs` contains only one reward function.
+            - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
+ If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is + `None`, the tokenizer for the model is automatically loaded using + [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward + functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes` + are ignored. + callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`): + List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed + in [here](https://huggingface.co/docs/transformers/main_classes/callback). + + If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`] + method. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): + A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your + model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`): + PEFT configuration used to wrap the model. If `None`, the model is not wrapped. + + """ + def __init__( + self, + model = None, + reward_funcs = None, + args = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + reward_processing_classes = None, + callbacks = None, + peft_config = None, + config = None, + reward_model = None, + policy = None, + ref_policy = None, + data_collator = None, + **kwargs + ): + if args is None: args = UnslothRLOOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if 
isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('rloo_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? [TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + model = model, + reward_funcs = reward_funcs, + args = args, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + reward_processing_classes = reward_processing_classes, + callbacks = callbacks, + peft_config = peft_config, + config = config, + reward_model = reward_model, + policy = policy, + ref_policy = ref_policy, + data_collator = data_collator,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass + + +if hasattr(logger, "addFilter"): + import logging + class HideLoggingMessage(logging.Filter): + def __init__(self, text): self.text = text + def filter(self, x): return not (self.text in x.getMessage()) + pass + logger.addFilter(HideLoggingMessage("`use_cache=True`")) + diff --git a/unsloth_compiled_cache/UnslothRewardTrainer.py b/unsloth_compiled_cache/UnslothRewardTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..d4aabeaf05bfef153ac5cb032efe1f78d8e4d304 --- /dev/null +++ 
b/unsloth_compiled_cache/UnslothRewardTrainer.py @@ -0,0 +1,1066 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . + +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.reward_trainer import (Any, BaseImageProcessor, Callable, DataCollator, Dataset, EvalPrediction, FeatureExtractionMixin, FrozenInstanceError, Optional, PartialState, Path, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RewardConfig, RewardDataCollatorWithPadding, RewardTrainer, Trainer, TrainerCallback, Union, _tokenize, compute_accuracy, decode_and_strip_padding, defaultdict, disable_dropout_in_model, gather_object, generate_model_card, get_comet_experiment_url, is_rich_available, is_wandb_available, log_table_to_comet_experiment, logger, logging, maybe_apply_chat_template, nested_detach, nn, os, pd, prepare_peft_model, print_rich_table, replace, torch, Optional, PreTrainedModel, Trainer, logger, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index 
= chunk_index.unsqueeze(-1)).squeeze(-1)
+        logsumexp_values = torch.logsumexp(chunk_logits, dim = -1)
+        per_token_logps = selected_logits - logsumexp_values
+        all_per_token_logps.append(per_token_logps)
+    pass
+    all_per_token_logps = torch.concat(all_per_token_logps)
+    all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1]))
+    return all_per_token_logps
+
+def calculate_pad_tokens_in_prompt(
+    input_ids: torch.Tensor,
+    logits_to_keep: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a batch of prompt token ids, returns the number of left-padding tokens in each sequence,
+    e.g. [pad, pad, pad, cat] -> 3 tokens.
+    """
+    if logits_to_keep >= input_ids.shape[1]:
+        raise ValueError("logits_to_keep must be smaller than the sequence length.")
+
+    prompt_section = input_ids[:, :-logits_to_keep]
+
+    padding_mask = (prompt_section == pad_token_id)
+
+    pad_token_counts = padding_mask.sum(dim=1)
+
+    return pad_token_counts
+
+def create_completion_attention_mask(
+    completion_input_ids: torch.Tensor,
+    left_pad_tokens_per_prompt: torch.Tensor,
+    max_left_pad: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given a sequence [p, p, p, c, c, c, pad, pad, pad], where p are extra prompt tokens left over from
+    slicing the torch tensor, c are completion tokens, and pad are padding tokens, build a completion
+    mask that zeroes out the pad and p tokens: in this example, [0,0,0,1,1,1,0,0,0].
+    """
+    batch_size, completion_len = completion_input_ids.shape
+    device = completion_input_ids.device
+
+    num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
+
+    indices = torch.arange(completion_len, device=device).unsqueeze(0)
+    shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
+
+    non_padding_mask = (completion_input_ids != pad_token_id)
+
+    final_mask = shift_mask & non_padding_mask
+
+    return final_mask
+
+def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
+    """
+    Moves all padding tokens in each sequence of a batch to the right.
+    """
+    mask = (tensor != pad_id)
+    # Must use stable=True since the binary mask is unordered within equal values
+    sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
+    packed_tensor = torch.gather(tensor, 1, sorted_indices)
+    return packed_tensor
+
+def align_logprobs_with_mask(
+    logprob_tensor: torch.Tensor,
+    attention_mask: torch.Tensor,
+    pad_value: float = 0.0
+) -> torch.Tensor:
+    """
+    Aligns a log probability tensor with a given attention mask.
+    """
+
+    device = logprob_tensor.device
+    batch_size, logprob_seq_len = logprob_tensor.shape
+    mask_seq_len = attention_mask.shape[1]
+
+    padded_logprobs = torch.full(
+        attention_mask.shape,
+        fill_value=pad_value,
+        dtype=logprob_tensor.dtype,
+        device=device
+    )
+
+    left_pad_counts = torch.argmax(attention_mask, dim=1)
+
+    cols = torch.arange(logprob_seq_len, device=device)
+    dest_indices = left_pad_counts.unsqueeze(1) + cols
+
+    # Create destination row indices
+    # Shape: [batch_size, logprob_seq_len]
+    row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
+
+    # Filter out-of-bounds indices and perform the assignment.
+    # Create a mask to identify only the indices that are within the bounds
+    # of the target tensor's sequence length.
+    valid_mask = dest_indices < mask_seq_len
+
+    # Use this mask to select only the valid row indices, column indices,
+    # and the corresponding values from the logprob tensor.
+    # This flattens the selected elements into 1D tensors.
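+    # e.g. (hypothetical shapes) a logprob row of length 3 preceded by 2 left-pad positions maps to
+    # destination columns [2, 3, 4]; any destination column >= mask_seq_len is discarded by valid_mask.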
+ valid_rows = row_indices[valid_mask] + valid_cols = dest_indices[valid_mask] + valid_vals = logprob_tensor[valid_mask] + + # Place the valid values into their correct positions in the padded tensor + # using a single, efficient advanced indexing operation. + padded_logprobs[valid_rows, valid_cols] = valid_vals + + return padded_logprobs +@dataclass +class UnslothRewardConfig(RewardConfig): + """ + +Configuration class for the [`RewardTrainer`]. + +This class includes only the parameters that are specific to Reward training. For a full list of training +arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this +class may differ from those in [`~transformers.TrainingArguments`]. + +Using [`~transformers.HfArgumentParser`] we can turn this class into +[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the +command line. + +Parameters: + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the sequences (prompt + completion) in the batch, filters out entries that exceed the + limit. This argument is required if you want to use the default data collator. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + dataset_num_proc (`int`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + center_rewards_coefficient (`float`, *optional*, defaults to `None`): + Coefficient to incentivize the reward model to output mean-zero rewards (proposed by + https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`. + remove_unused_columns (`bool`, *optional*, defaults to `False`): + Whether to remove the columns that are not used by the model's forward pass. Can be `True` only if the + dataset is pretokenized. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + max_seq_length : Optional[int] = field( + default = None, + metadata = {'help': 'Maximum sequence length to truncate to.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = False, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = None, + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + parallelism_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + project = 'huggingface', + trackio_space_id = 'trackio', + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = True, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + 
batch_eval_metrics = False,
+        eval_on_start = False,
+        use_liger_kernel = False,
+        liger_kernel_config = None,
+        eval_use_gather_object = False,
+        average_tokens_across_devices = True,
+        max_length = 1024,
+        disable_dropout = True,
+        dataset_num_proc = None,
+        center_rewards_coefficient = None,
+        vllm_sampling_params = None,
+        unsloth_num_chunks = -1,
+        max_seq_length = None,
+        **kwargs,
+    ):
+        if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is smaller than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!')
+        if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is greater than 1, which is far too large! Consider decreasing it to 1e-1, otherwise gradient updates will explode!')
+        if output_dir is None and save_strategy == 'steps' and save_steps == 500:
+            output_dir = 'unsloth_training_checkpoints'
+            save_strategy = 'no'
+        if dataset_num_proc is None:
+            from multiprocessing import cpu_count
+            dataset_num_proc = min(max(cpu_count()+4, 2), 64)
+
+        super().__init__(
+            output_dir = output_dir,
+            overwrite_output_dir = overwrite_output_dir,
+            do_train = do_train,
+            do_eval = do_eval,
+            do_predict = do_predict,
+            eval_strategy = eval_strategy,
+            prediction_loss_only = prediction_loss_only,
+            per_device_train_batch_size = per_device_train_batch_size,
+            per_device_eval_batch_size = per_device_eval_batch_size,
+            per_gpu_train_batch_size = per_gpu_train_batch_size,
+            per_gpu_eval_batch_size = per_gpu_eval_batch_size,
+            gradient_accumulation_steps = gradient_accumulation_steps,
+            eval_accumulation_steps = eval_accumulation_steps,
+            eval_delay = eval_delay,
+            torch_empty_cache_steps = torch_empty_cache_steps,
+            learning_rate = learning_rate,
+            weight_decay = weight_decay,
+            adam_beta1 = adam_beta1,
+            adam_beta2 = adam_beta2,
+            adam_epsilon = adam_epsilon,
+            max_grad_norm = max_grad_norm,
+            num_train_epochs = num_train_epochs,
+            max_steps = max_steps,
+            lr_scheduler_type = lr_scheduler_type,
+            warmup_ratio = warmup_ratio,
+            warmup_steps = warmup_steps,
+            log_level = log_level,
+            log_level_replica = log_level_replica,
+            log_on_each_node = log_on_each_node,
+            logging_dir = logging_dir,
+            logging_strategy = logging_strategy,
+            logging_first_step = logging_first_step,
+            logging_steps = logging_steps,
+            logging_nan_inf_filter = logging_nan_inf_filter,
+            save_strategy = save_strategy,
+            save_steps = save_steps,
+            save_total_limit = save_total_limit,
+            save_safetensors = save_safetensors,
+            save_on_each_node = save_on_each_node,
+            save_only_model = save_only_model,
+            restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint,
+            no_cuda = no_cuda,
+            use_cpu = use_cpu,
+            use_mps_device = use_mps_device,
+            seed = seed,
+            data_seed = data_seed,
+            jit_mode_eval = jit_mode_eval,
+            bf16 = bf16,
+            fp16 = fp16,
+            fp16_opt_level = fp16_opt_level,
+            half_precision_backend = half_precision_backend,
+            bf16_full_eval = bf16_full_eval,
+            fp16_full_eval = fp16_full_eval,
+            tf32 = tf32,
+            local_rank = local_rank,
+            ddp_backend = ddp_backend,
+            tpu_num_cores = tpu_num_cores,
+            tpu_metrics_debug = tpu_metrics_debug,
+            debug = debug,
+            dataloader_drop_last = dataloader_drop_last,
+            eval_steps = eval_steps,
+            dataloader_num_workers = dataloader_num_workers,
+            dataloader_prefetch_factor = dataloader_prefetch_factor,
+            past_index = past_index,
+            run_name = run_name,
+            disable_tqdm = disable_tqdm,
+            remove_unused_columns = remove_unused_columns,
+            label_names = label_names,
+            load_best_model_at_end = load_best_model_at_end,
+            metric_for_best_model = metric_for_best_model,
+            greater_is_better = greater_is_better,
+            ignore_data_skip = ignore_data_skip,
+            fsdp = fsdp,
+            fsdp_min_num_params = fsdp_min_num_params,
+            fsdp_config = fsdp_config,
+            fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap,
+            accelerator_config = accelerator_config,
+            parallelism_config = parallelism_config,
+            deepspeed = deepspeed,
+            label_smoothing_factor = label_smoothing_factor,
+            optim = optim,
+            optim_args = optim_args,
+            adafactor = adafactor,
+            group_by_length = group_by_length,
+            length_column_name = length_column_name,
+            report_to = report_to,
+            project = project,
+            trackio_space_id = trackio_space_id,
+            ddp_find_unused_parameters = ddp_find_unused_parameters,
+            ddp_bucket_cap_mb = ddp_bucket_cap_mb,
+            ddp_broadcast_buffers = ddp_broadcast_buffers,
+            dataloader_pin_memory = dataloader_pin_memory,
+            dataloader_persistent_workers = dataloader_persistent_workers,
+            skip_memory_metrics = skip_memory_metrics,
+            use_legacy_prediction_loop = use_legacy_prediction_loop,
+            push_to_hub = push_to_hub,
+            resume_from_checkpoint = resume_from_checkpoint,
+            hub_model_id = hub_model_id,
+            hub_strategy = hub_strategy,
+            hub_token = hub_token,
+            hub_private_repo = hub_private_repo,
+            hub_always_push = hub_always_push,
+            hub_revision = hub_revision,
+            gradient_checkpointing = gradient_checkpointing,
+            gradient_checkpointing_kwargs = gradient_checkpointing_kwargs,
+            include_inputs_for_metrics = include_inputs_for_metrics,
+            eval_do_concat_batches = eval_do_concat_batches,
+            fp16_backend = fp16_backend,
+            push_to_hub_model_id = push_to_hub_model_id,
+            push_to_hub_organization = push_to_hub_organization,
+            push_to_hub_token = push_to_hub_token,
+            mp_parameters = mp_parameters,
+            auto_find_batch_size = auto_find_batch_size,
+            full_determinism = full_determinism,
+            torchdynamo = torchdynamo,
+            ray_scope = ray_scope,
+            ddp_timeout = ddp_timeout,
+            torch_compile = torch_compile,
+            torch_compile_backend = torch_compile_backend,
+            torch_compile_mode = torch_compile_mode,
+            include_tokens_per_second = include_tokens_per_second,
+            include_num_input_tokens_seen = include_num_input_tokens_seen,
+            neftune_noise_alpha = neftune_noise_alpha,
+            optim_target_modules = optim_target_modules,
+            batch_eval_metrics = batch_eval_metrics,
+            eval_on_start = eval_on_start,
+            use_liger_kernel = use_liger_kernel,
+            liger_kernel_config = liger_kernel_config,
+            eval_use_gather_object = eval_use_gather_object,
+            average_tokens_across_devices = average_tokens_across_devices,
+            max_length = max_length,
+            disable_dropout = disable_dropout,
+            dataset_num_proc = dataset_num_proc,
+            center_rewards_coefficient = center_rewards_coefficient,**kwargs)
+        self.vllm_sampling_params = vllm_sampling_params
+        self.unsloth_num_chunks = unsloth_num_chunks
+        self.max_seq_length = max_seq_length
+pass
+
+class _UnslothRewardTrainer(Trainer):
+    """
+    Trainer for custom reward.
+
+    Args:
+        model ([`~transformers.PreTrainedModel`] or `torch.nn.Module`, *optional*):
+            Model to be trained, preferably an [`~transformers.AutoModelForSequenceClassification`].
+        args ([`RewardConfig`], *optional*):
+            Training arguments.
+        data_collator ([`~transformers.DataCollator`], *optional*):
+            The data collator to use for training. If None is specified, the default data collator
+            [`~trainer.utils.RewardDataCollatorWithPadding`] will be used, which will pad the sequences to the
+            maximum length of the sequences in the batch, given a dataset of paired sequences.
+ train_dataset ([`~datasets.Dataset`], *optional*): + The dataset to use for training. + eval_dataset ([`~datasets.Dataset`], *optional*): + The dataset to use for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along with the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + model_init (`Callable[[], transformers.PreTrainedModel]`, *optional*): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + compute_metrics (`Callable[[transformers.EvalPrediction], dict]`, *optional*, defaults to [`~trainer.utils.compute_accuracy`]): + Function to compute metrics at evaluation. Must take in an [`~transformers.EvalPrediction`] and return a + dictionary mapping strings to floats. + callbacks (`list` of [`~transformers.TrainerCallback`], *optional*): + Callbacks to use during training. + optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`): + Tuple containing the optimizer and the learning rate scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*): + Function to preprocess the logits before computing the metrics. Must take in the `logits` and `labels` and + return the logits to be used for metrics computation. + peft_config (`dict`, *optional*): + PEFT configuration to use for training. If `None`, PEFT is not used. If provided, the `model` will be + wrapped with the specified PEFT adapter. 
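+
+    As a quick orientation (this summarizes `compute_loss` below rather than adding any new API): the trainer
+    scores each (chosen, rejected) pair and minimizes the pairwise loss `-log(sigmoid(r_chosen - r_rejected))`,
+    optionally shifted by a per-sample `margin` column, plus an optional regularizer
+    `center_rewards_coefficient * mean((r_chosen + r_rejected) ** 2)` when that coefficient is set.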
+ """ + + _tag_names = ["trl", "reward-trainer"] + + def __init__( + self, + model: Optional[Union[PreTrainedModel, nn.Module]] = None, + args: Optional[RewardConfig] = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = ( + None, + None, + ), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional[dict] = None, + ): + if False: + model = prepare_peft_model(model, peft_config, args) + + # Disable dropout in the model + if args.disable_dropout: + disable_dropout_in_model(model) + + if compute_metrics is None: + compute_metrics = compute_accuracy + + if data_collator is None: + if processing_class is None: + raise ValueError( + "A processing_class must be specified when using the default RewardDataCollatorWithPadding" + ) + + max_length = args.max_length + + data_collator = RewardDataCollatorWithPadding(processing_class) + + if args.remove_unused_columns: + try: # for bc before https://github.com/huggingface/transformers/pull/25435 + args.remove_unused_columns = False + except FrozenInstanceError: + args = replace(args, remove_unused_columns=False) + # warn users + logger.warning( + "When using RewardDataCollatorWithPadding, you should set `remove_unused_columns=False` in your RewardConfig" + " we have set it for you, but you should do it yourself in the future.", + ) + + self.use_reward_data_collator = True + else: + self.use_reward_data_collator = False + + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in Reward, the sampled data does not include the + # "input_ids" key. Instead, the available keys are "input_ids_chosen" and "input_ids_rejected". As a result, + # the trainer issues the warning: "Could not estimate the number of tokens of the input, floating-point + # operations will not be computed." To suppress this warning, we set the "estimate_tokens" key in the model's + # "warnings_issued" dictionary to True. This acts as a flag to indicate that the warning has already been + # issued. + model.warnings_issued["estimate_tokens"] = True + + if "input_ids_chosen" not in train_dataset.column_names: + with PartialState().main_process_first(): + fn_kwargs = {"tokenizer": processing_class} + train_dataset = train_dataset.map(maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}) + train_dataset = train_dataset.map( + _tokenize, + batched=True, + fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + ) + # This filter is important because otherwise you get samples that exceed the model's context length and + # get truncated => noisy signal the chosen/rejected label gets lost. The downside is that the + # user might get surprised if N samples are missing from training. 
+ train_dataset = train_dataset.filter( + lambda x: len(x["input_ids_chosen"]) <= max_length and len(x["input_ids_rejected"]) <= max_length, + num_proc=args.dataset_num_proc, + ) + if eval_dataset is not None: + eval_dataset = eval_dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class} + ) + eval_dataset = eval_dataset.map( + _tokenize, + fn_kwargs=fn_kwargs, + batched=True, + num_proc=args.dataset_num_proc, + ) + # This filter is important because otherwise you get samples that exceed the model's context length and + # get truncated, which yields a noisy signal since the chosen/rejected label is lost. The downside is that the + # user might be surprised if N samples are missing from training. + eval_dataset = eval_dataset.filter( + lambda x: len(x["input_ids_chosen"]) <= max_length + and len(x["input_ids_rejected"]) <= max_length, + num_proc=args.dataset_num_proc, + ) + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + def compute_loss( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + return_outputs=False, + num_items_in_batch=None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]: + rewards_chosen = model( + input_ids=inputs["input_ids_chosen"], + attention_mask=inputs["attention_mask_chosen"], + return_dict=True, + )["logits"] + rewards_rejected = model( + input_ids=inputs["input_ids_rejected"], + attention_mask=inputs["attention_mask_rejected"], + return_dict=True, + )["logits"] + # calculate loss, optionally modulate with margin + if "margin" in inputs: + loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected - inputs["margin"]).mean() + else: + loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean() + + if self.args.center_rewards_coefficient is not None: + loss += self.args.center_rewards_coefficient * torch.mean((rewards_chosen + rewards_rejected) ** 2) + + if return_outputs: + return loss, { + "rewards_chosen": rewards_chosen, + "rewards_rejected": rewards_rejected, + } + return loss + + def prediction_step( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[list[str]] = None, + ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: + inputs = self._prepare_inputs(inputs) + if ignore_keys is None: + if hasattr(self.model, "config"): + ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + with torch.no_grad(): + loss, logits_dict = self.compute_loss(model, inputs, return_outputs=True) + + if prediction_loss_only: + return (loss, None, None) + + loss = loss.detach() + logits = tuple(v for k, v in logits_dict.items() if k not in ignore_keys) + logits = nested_detach(logits) + # Stack accepted against rejected, take the mean over logits, + # and softmax so that the accepted/rejected preferences sum to 1 + logits = torch.stack(logits).mean(dim=2).softmax(dim=0).T + + labels = torch.zeros(logits.shape[0]) + labels = 
self._prepare_inputs(labels) + + return loss, logits, labels + + def evaluate(self, *args, **kwargs): + num_print_samples = kwargs.pop("num_print_samples", 4) + self.visualize_samples(num_print_samples) + return super().evaluate(*args, **kwargs) + + def visualize_samples(self, num_print_samples: int): + """ + Visualize the reward model logits prediction + + Args: + num_print_samples (`int`, defaults to `4`): + The number of samples to print. Set to `-1` to print all samples. + """ + eval_dataloader = self.get_eval_dataloader() + table = defaultdict(list) + for _, inputs in enumerate(eval_dataloader): + _, logits, _ = self.prediction_step(self.model, inputs, prediction_loss_only=False) + chosen_text = decode_and_strip_padding(inputs["input_ids_chosen"], self.processing_class) + rejected_text = decode_and_strip_padding(inputs["input_ids_rejected"], self.processing_class) + table["chosen_text"].extend(gather_object(chosen_text)) + table["rejected_text"].extend(gather_object(rejected_text)) + table["logits"].extend( + gather_object([[round(inner_item, 4) for inner_item in item] for item in logits.tolist()]) + ) + if num_print_samples >= 0 and len(table["chosen_text"]) >= num_print_samples: + break + df = pd.DataFrame(table) + if self.accelerator.process_index == 0: + if is_rich_available(): + print_rich_table(df[:num_print_samples]) + if "wandb" in self.args.report_to: + import wandb + + if wandb.run is not None: + wandb.log({"completions": wandb.Table(dataframe=df)}) + + if "comet_ml" in self.args.report_to: + log_table_to_comet_experiment( + name="completions.csv", + table=df, + ) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="Reward", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothRewardTrainer(_UnslothRewardTrainer): + """ + +Trainer for custom reward. 
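+
+Example (an illustrative sketch: the model, tokenizer, and dataset are placeholders you must supply, not part of the generated API):
+
+```python
+from trl import RewardConfig
+
+trainer = UnslothRewardTrainer(
+    model = model,                    # e.g. a sequence-classification model with a single reward head
+    args = RewardConfig(output_dir = "reward_model"),
+    train_dataset = dataset,          # paired (chosen, rejected) preference data
+    processing_class = tokenizer,
+)
+trainer.train()
+```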
+ +Args: + model ([`~transformers.PreTrainedModel`] or `torch.nn.Module`, *optional*): + Model to be trained, preferably an [`~transformers.AutoModelForSequenceClassification`]. + args ([`RewardConfig`], *optional*): + Training arguments. + data_collator ([`~transformers.DataCollator`], *optional*): + The data collator to use for training. If None is specified, the default data collator + [`~trainer.utils.RewardDataCollatorWithPadding`] will be used, which pads the sequences to the maximum + length of the sequences in the batch, given a dataset of paired sequences. + train_dataset ([`~datasets.Dataset`], *optional*): + The dataset to use for training. + eval_dataset ([`~datasets.Dataset`], *optional*): + The dataset to use for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along with the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + model_init (`Callable[[], transformers.PreTrainedModel]`, *optional*): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + compute_metrics (`Callable[[transformers.EvalPrediction], dict]`, *optional*, defaults to [`~trainer.utils.compute_accuracy`]): + Function to compute metrics at evaluation. Must take in an [`~transformers.EvalPrediction`] and return a + dictionary mapping strings to floats. + callbacks (`list` of [`~transformers.TrainerCallback`], *optional*): + Callbacks to use during training. + optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`): + Tuple containing the optimizer and the learning rate scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*): + Function to preprocess the logits before computing the metrics. Must take in the `logits` and `labels` and + return the logits to be used for metrics computation. + peft_config (`dict`, *optional*): + PEFT configuration to use for training. If `None`, PEFT is not used. If provided, the `model` will be + wrapped with the specified PEFT adapter. 
+ + """ + def __init__( + self, + model = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + model_init = None, + compute_metrics = None, + callbacks = None, + preprocess_logits_for_metrics = None, + peft_config = None, + **kwargs + ): + if args is None: args = UnslothRewardConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + 
args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('reward_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? 
[TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + model = model, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + model_init = model_init, + compute_metrics = compute_metrics, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass + + +if hasattr(logger, "addFilter"): + import logging + class HideLoggingMessage(logging.Filter): + def __init__(self, text): self.text = text + def filter(self, x): return not (self.text in x.getMessage()) + pass + logger.addFilter(HideLoggingMessage("`use_cache=True`")) + diff --git a/unsloth_compiled_cache/UnslothSFTTrainer.py b/unsloth_compiled_cache/UnslothSFTTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..25830fad7d842e32b02eafcd50c15b6a1efdeb23 --- /dev/null +++ b/unsloth_compiled_cache/UnslothSFTTrainer.py @@ -0,0 +1,1553 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . 
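+
+# Rough usage sketch for the log-softmax helper defined below (shapes are illustrative only, not a public API):
+#
+#   logits = model(input_ids).logits                                    # [batch, seq, vocab]
+#   per_token_logps = chunked_selective_log_softmax(logits, input_ids)  # [batch, seq]
+#
+# The helper splits the flattened logits into 4 chunks, upcasts each chunk to float32, and computes
+# gather(logits, index) - logsumexp(logits) per token, bounding peak memory compared to one full pass.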
+ +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.sft_trainer import (Any, AutoConfig, AutoProcessor, Callable, DataCollator, DataCollatorForLanguageModeling, DataCollatorForVisionLanguageModeling, Dataset, EvalPrediction, IterableDataset, Optional, Path, PeftConfig, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SFTConfig, SFTTrainer, Trainer, TrainerCallback, TrainingArguments, Union, clone_chat_template, contextlib, dataclass, defaultdict, dft_loss, generate_model_card, get_act_offloading_ctx_manager, get_comet_experiment_url, is_conversational, is_wandb_available, logger, logging, nn, os, pack_dataset, pad, prepare_peft_model, selective_log_softmax, torch, transformers, Callable, DataCollator, DataCollatorForLanguageModeling, Dataset, IterableDataset, Optional, Union, os, pack_dataset, pad, transformers, Optional, PreTrainedModel, Trainer, logger, os, torch, os) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) + logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values + all_per_token_logps.append(per_token_logps) + pass + all_per_token_logps = torch.concat(all_per_token_logps) + all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) + return all_per_token_logps + +def calculate_pad_tokens_in_prompt( + input_ids: torch.Tensor, + logits_to_keep: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given prompt tensor, it returns all the left padded tokens in that sequence. 
e.g. [pad, pad, pad, cat] = 3 pad tokens + """ + if logits_to_keep >= input_ids.shape[1]: + raise ValueError("logits_to_keep must be smaller than the sequence length.") + + prompt_section = input_ids[:, :-logits_to_keep] + + padding_mask = (prompt_section == pad_token_id) + + pad_token_counts = padding_mask.sum(dim=1) + + return pad_token_counts + +def create_completion_attention_mask( + completion_input_ids: torch.Tensor, + left_pad_tokens_per_prompt: torch.Tensor, + max_left_pad: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given that we have a sequence [p,p,p,c,c,c,pad,pad,pad], + + where p are extra prompt tokens left over from slicing the torch tensor, c are completion tokens, + and pad are pad tokens, this function builds a completion mask that zeros out the pad + and p tokens. So, in this example, the mask is [0,0,0,1,1,1,0,0,0]. + """ + batch_size, completion_len = completion_input_ids.shape + device = completion_input_ids.device + + num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt + + indices = torch.arange(completion_len, device=device).unsqueeze(0) + shift_mask = indices >= num_tokens_to_mask.unsqueeze(1) + + non_padding_mask = (completion_input_ids != pad_token_id) + + final_mask = shift_mask & non_padding_mask + + return final_mask + +def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor: + """ + Moves all padding tokens in each sequence of a batch to the right. + """ + mask = (tensor != pad_id) + # Must do stable=True since the binary mask is unordered + sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True) + packed_tensor = torch.gather(tensor, 1, sorted_indices) + return packed_tensor + +def align_logprobs_with_mask( + logprob_tensor: torch.Tensor, + attention_mask: torch.Tensor, + pad_value: float = 0.0 +) -> torch.Tensor: + """ + Aligns a log probability tensor with a given attention mask. + """ + + device = logprob_tensor.device + batch_size, logprob_seq_len = logprob_tensor.shape + mask_seq_len = attention_mask.shape[1] + + padded_logprobs = torch.full( + attention_mask.shape, + fill_value=pad_value, + dtype=logprob_tensor.dtype, + device=device + ) + + left_pad_counts = torch.argmax(attention_mask, dim=1) + + cols = torch.arange(logprob_seq_len, device=device) + dest_indices = left_pad_counts.unsqueeze(1) + cols + + # Create destination row indices + # Shape: [batch_size, logprob_seq_len] + row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices) + + # --- Filter out-of-bounds indices and perform assignment --- + # Create a mask to identify only the indices that are within the bounds + # of the target tensor's sequence length. + valid_mask = dest_indices < mask_seq_len + + # Use this mask to select only the valid row indices, column indices, + # and the corresponding values from the logprob tensor. + # This flattens the selected elements into 1D tensors. + valid_rows = row_indices[valid_mask] + valid_cols = dest_indices[valid_mask] + valid_vals = logprob_tensor[valid_mask] + + # Place the valid values into their correct positions in the padded tensor + # using a single, efficient advanced indexing operation. + padded_logprobs[valid_rows, valid_cols] = valid_vals + + return padded_logprobs +@dataclass +class UnslothSFTConfig(SFTConfig): + """ + +Configuration class for the [`SFTTrainer`]. + +This class includes only the parameters that are specific to SFT training. For a full list of training arguments, +please refer to the [`~transformers.TrainingArguments`] documentation. 
Note that default values in this class may +differ from those in [`~transformers.TrainingArguments`]. + +Using [`~transformers.HfArgumentParser`] we can turn this class into +[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the +command line. + +Parameters: + > Parameters that control the model + + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model` + argument of the [`SFTTrainer`] is provided as a string. If you're training an MoE architecture and want to + include the load balancing/auxiliary loss as a part of the final loss, remember to set + `output_router_logits=True` in this dictionary. + chat_template_path (`str` or `None`, *optional*, defaults to `None`): + If specified, sets the model's chat template. This can either be the path to a tokenizer (local directory + or Hugging Face Hub model) or a direct path to a Jinja template file. When using a Jinja file, you must + ensure that any special tokens referenced in the template are added to the tokenizer and that the model's + embedding layer is resized accordingly. + + > Parameters that control the data preprocessing + + dataset_text_field (`str`, *optional*, defaults to `"text"`): + Name of the column that contains text data in the dataset. + dataset_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Dictionary of optional keyword arguments for the dataset preparation. The only supported key is + `skip_prepare_dataset`. When the model is a VLM, `skip_prepare_dataset` is automatically treated as `True` + regardless of the provided value, since preprocessing is done on the fly. + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + eos_token (`str` or `None`, *optional*, defaults to `None`): + Token used to indicate the end of a turn or sequence. If `None`, it defaults to + `processing_class.eos_token`. + pad_token (`int` or `None`, *optional*, defaults to `None`): + Token used for padding. If `None`, it defaults to `processing_class.pad_token`, or if that is also `None`, + it falls back to `processing_class.eos_token`. + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the tokenized sequence. Sequences longer than `max_length` are truncated from the right. + If `None`, no truncation is applied. When packing is enabled, this value sets the sequence length. + packing (`bool`, *optional*, defaults to `False`): + Whether to group multiple sequences into fixed-length blocks to improve computational efficiency and reduce + padding. Uses `max_length` to define sequence length. + packing_strategy (`str`, *optional*, defaults to `"bfd"`): + Strategy for packing sequences. Can be either `"bfd"` (best-fit decreasing, default) or `"wrapped"`. + padding_free (`bool`, *optional*, defaults to `False`): + Whether to perform forward passes without padding by flattening all sequences in the batch into a single + continuous sequence. This reduces memory usage by eliminating padding overhead. Currently, this is only + supported with FlashAttention 2 or 3, which can efficiently handle the flattened batch structure. When + packing is enabled with strategy `"bfd"`, padding-free is enabled, regardless of the value of this + parameter. 
+ pad_to_multiple_of (`int` or `None`, *optional*, defaults to `None`): + If set, the sequences will be padded to a multiple of this value. + eval_packing (`bool` or `None`, *optional*, defaults to `None`): + Whether to pack the eval dataset. If `None`, uses the same value as `packing`. + + > Parameters that control the training + + completion_only_loss (`bool` or `None`, *optional*, defaults to `None`): + Whether to compute loss only on the completion part of the sequence. If set to `True`, loss is computed + only on the completion, which is supported only for [prompt-completion](#prompt-completion) datasets. If + `False`, loss is computed on the entire sequence. If `None` (default), the behavior depends on the dataset: + loss is computed on the completion for [prompt-completion](#prompt-completion) datasets, and on the full + sequence for [language modeling](#language-modeling) datasets. + assistant_only_loss (`bool`, *optional*, defaults to `False`): + Whether to compute loss only on the assistant part of the sequence. If set to `True`, loss is computed only + on the assistant responses, which is supported only for [conversational](#conversational) datasets. If + `False`, loss is computed on the entire sequence. + loss_type (`str`, *optional*, defaults to `"nll"`): + Type of loss to use. Possible values are `"nll"` (negative log-likelihood, default) and `"dft"` (Dynamic + Fine-Tuning, as described in [this paper](https://huggingface.co/papers/2508.05629)). + activation_offloading (`bool`, *optional*, defaults to `False`): + Whether to offload the activations to the CPU. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + max_seq_length : Optional[int] = field( + default = None, + metadata = {'help': 'Maximum sequence length to truncate to.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = None, + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + parallelism_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + project = 'huggingface', + trackio_space_id = 'trackio', + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = True, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + 
batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = True, + model_init_kwargs = None, + chat_template_path = None, + dataset_text_field = 'text', + dataset_kwargs = None, + dataset_num_proc = None, + eos_token = None, + pad_token = None, + max_length = 1024, + packing = False, + packing_strategy = 'bfd', + padding_free = False, + pad_to_multiple_of = None, + eval_packing = None, + completion_only_loss = None, + assistant_only_loss = False, + loss_type = 'nll', + activation_offloading = False, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + max_seq_length = None, + **kwargs, + ): + if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small (less than 1e-7)! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too large (> 1)! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = min(max(cpu_count()+4, 2), 64) + if os.environ.get('UNSLOTH_ENABLE_FLEX_ATTENTION', '0') == '1': + from unsloth_zoo.flex_attention import HAS_FLEX_ATTENTION + if HAS_FLEX_ATTENTION and pad_to_multiple_of is None: + from unsloth_zoo.flex_attention import FLEX_ATTENTION_BLOCK_SIZE + pad_to_multiple_of = FLEX_ATTENTION_BLOCK_SIZE + + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + 
bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + model_init_kwargs = model_init_kwargs, + chat_template_path = chat_template_path, + dataset_text_field = dataset_text_field, + dataset_kwargs = dataset_kwargs, + dataset_num_proc = dataset_num_proc, + eos_token = eos_token, + pad_token = pad_token, + max_length = max_length, + packing = packing, + packing_strategy = packing_strategy, + padding_free = padding_free, + 
pad_to_multiple_of = pad_to_multiple_of, + eval_packing = eval_packing, + completion_only_loss = completion_only_loss, + assistant_only_loss = assistant_only_loss, + loss_type = loss_type, + activation_offloading = activation_offloading,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks + self.max_seq_length = max_seq_length +pass + +class _UnslothSFTTrainer(Trainer): + """ + Trainer for the Supervised Fine-Tuning (SFT) method. + + This class is a wrapper around the [`~transformers.Trainer`] class and inherits all of its attributes and methods. + + Example: + + ```python + from datasets import load_dataset + from trl import SFTTrainer + + dataset = load_dataset("roneneldan/TinyStories", split="train[:1%]") + + trainer = SFTTrainer(model="Qwen/Qwen2-0.5B-Instruct", train_dataset=dataset) + trainer.train() + ``` + + Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using `<architecture>.from_pretrained` (where `<architecture>` is derived from the model + config) with the keyword arguments in `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. + If you're training a model with an MoE architecture and want to include the load balancing/auxiliary loss + as a part of the final loss, remember to set the `output_router_logits` config of the model to `True`. + args ([`SFTConfig`], *optional*, defaults to `None`): + Configuration for this trainer. If `None`, a default configuration is used. + data_collator ([`~transformers.DataCollator`] or `None`, *optional*): + Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`. + Will default to [`~trainer.sft_trainer.DataCollatorForLanguageModeling`] if the model is a language model + and [`~trainer.sft_trainer.DataCollatorForVisionLanguageModeling`] if the model is a vision-language model. + train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]): + Dataset to use for training. SFT supports both [language modeling](#language-modeling) type and + [prompt-completion](#prompt-completion) type. The format of the samples can be either: + + - [Standard](dataset_formats#standard): Each sample contains plain text. + - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role + and content). + + The trainer also supports processed datasets (tokenized) as long as they contain an `input_ids` field. + eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`): + Dataset to use for evaluation. It must meet the same requirements as `train_dataset`. + processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`] or `None`, *optional*, defaults to `None`): + Processing class used to process the data. If `None`, the processing class is loaded from the model's name + with [`~transformers.AutoProcessor.from_pretrained`]. A padding token, `tokenizer.pad_token`, must be set. + If the processing class has not set a padding token, `tokenizer.eos_token` will be used as the default. 
+ compute_loss_func (`Callable` or `None`, *optional*, defaults to `None`): + A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated + batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, see the default [loss + function](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/trainer.py#L3618) + used by [`Trainer`]. + compute_metrics (`Callable[[EvalPrediction], dict]` or `None`, *optional*, defaults to `None`): + The function that will be used to compute metrics at evaluation. Must take a + [`~transformers.EvalPrediction`] and return a dictionary mapping strings to metric values. When passing + [`SFTConfig`] with `batch_eval_metrics` set to `True`, your `compute_metrics` function must take a boolean + `compute_result` argument. This will be triggered after the last eval batch to signal that the function + needs to calculate and return the global summary statistics rather than accumulating the batch-level + statistics. + callbacks (list of [`~transformers.TrainerCallback`] or `None`, *optional*, defaults to `None`): + List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed + [here](https://huggingface.co/docs/transformers/main_classes/callback). + + If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`] + method. + optimizers (`tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]]`, *optional*, defaults to `(None, None)`): + A tuple containing the optimizer and the scheduler to use. Will default to an instance of `AdamW` on your + model and a scheduler given by [`~transformers.get_linear_schedule_with_warmup`] controlled by `args`. + optimizer_cls_and_kwargs (`tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*, defaults to `None`): + A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in + `args`. Incompatible with the `optimizers` argument. + + Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before + initializing the Trainer. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*, defaults to `None`): + A function that preprocesses the logits right before caching them at each evaluation step. Must take two + tensors, the logits and the labels, and return the logits once processed as desired. The modifications made + by this function will be reflected in the predictions received by `compute_metrics`. + + Note that the labels (second parameter) will be `None` if the dataset does not have them. + peft_config ([`~peft.PeftConfig`] or `None`, *optional*, defaults to `None`): + PEFT configuration used to wrap the model. If `None`, the model is not wrapped. + formatting_func (`Callable` or `None`, *optional*, defaults to `None`): + Formatting function applied to the dataset before tokenization. Applying the formatting function explicitly + converts the dataset into a [language modeling](#language-modeling) type. 
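+
+    As a small illustrative reminder of the two supported dataset types (field names follow the TRL
+    dataset-format conventions referenced above):
+
+    ```python
+    {"text": "The sky is blue."}                          # language modeling type
+    {"prompt": "The sky is", "completion": " blue."}      # prompt-completion type
+    ```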
+ """ + + _tag_names = ["trl", "sft"] + + def __init__( + self, + model: Union[str, nn.Module, PreTrainedModel], + args: Optional[Union[SFTConfig, TrainingArguments]] = None, + data_collator: Optional[DataCollator] = None, # type: ignore + train_dataset: Optional[Union[Dataset, IterableDataset]] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[Union[PreTrainedTokenizerBase, ProcessorMixin]] = None, + compute_loss_func: Optional[Callable] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), + optimizer_cls_and_kwargs: Optional[tuple[type[torch.optim.Optimizer], dict[str, Any]]] = None, + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional["PeftConfig"] = None, + formatting_func: Optional[Callable[[dict], str]] = None, + ): + # Args + if args is None: + model_name = model if isinstance(model, str) else model.config._name_or_path + model_name = model_name.split("/")[-1] + args = SFTConfig(f"{model_name}-SFT") + elif isinstance(args, TrainingArguments) and not isinstance(args, SFTConfig): + dict_args = args.to_dict() + dict_args["hub_token"] = args.hub_token # to_dict hides the hub_token + dict_args.pop("push_to_hub_token") + args = SFTConfig(**dict_args) + + # Model + model_init_kwargs = args.model_init_kwargs or {} + if isinstance(model, str): + model_id = model + dtype = model_init_kwargs.get("dtype") + if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None: + pass # dtype is already a torch.dtype or "auto" or None + elif isinstance(dtype, str) and dtype in ["bfloat16", "float16", "float32"]: + dtype = getattr(torch, dtype) + model_init_kwargs["dtype"] = dtype + else: + raise ValueError( + "Invalid `dtype` passed to `SFTConfig`. Expected either 'auto' or a string representing " + f"a valid `torch.dtype` (e.g., 'float32'), but got {dtype}." + ) + config = AutoConfig.from_pretrained(model_id) + architecture = getattr(transformers, config.architectures[0]) + model = architecture.from_pretrained(model_id, **model_init_kwargs) + else: + model_id = model.config._name_or_path + if args.model_init_kwargs is not None: + logger.warning( + "You passed `model_init_kwargs` to the `SFTConfig`, but your model is already instantiated. " + "The `model_init_kwargs` will be ignored." + ) + + # Processing class + if processing_class is None: + processing_class = AutoProcessor.from_pretrained(model_id) + + # Handle pad token for processors or tokenizers + if isinstance(processing_class, ProcessorMixin): + tokenizer = processing_class.tokenizer + self._is_vlm = False + elif isinstance(processing_class, PreTrainedTokenizerBase): + tokenizer = processing_class + self._is_vlm = False + else: + raise TypeError("The `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`") + + if args.eos_token is not None: + eos_token = args.eos_token + eos_token_id = tokenizer.convert_tokens_to_ids(eos_token) + if eos_token_id is None: + raise ValueError( + f"The specified `eos_token` ('{eos_token}') is not found in the vocabulary of the given " + f"`processing_class` ({processing_class.__class__.__name__}). Ensure that the `eos_token` exists " + "in the vocabulary before using it as an EOS token." 
+ ) + tokenizer.eos_token_id = eos_token_id + + if args.chat_template_path is not None: + if os.path.isfile(args.chat_template_path) and args.chat_template_path.endswith((".jinja", ".j2")): + with open(args.chat_template_path, encoding="utf-8") as chat_template_file: + processing_class.chat_template = chat_template_file.read() + added_tokens = [] + else: + model, processing_class, added_tokens = clone_chat_template( + model, processing_class, args.chat_template_path + ) + else: + added_tokens = [] + + # Catch some wrong configurations related to VLMs + if self._is_vlm and args.packing: + raise ValueError( + "Packing is not supported for vision-language models. Please set `packing=False` in the SFTConfig." + ) + if self._is_vlm and args.padding_free: + raise ValueError( + "Padding-free training is yet not supported for vision-language models. Please set " + "`padding_free=False` in the `SFTConfig`." + ) + if self._is_vlm and args.assistant_only_loss: + raise ValueError( + "Assistant-only loss is not yet supported for vision-language models. Please set " + "`assistant_only_loss=False` in the `SFTConfig`." + ) + + # PEFT configuration and model wrapping + if False: + if added_tokens: + # Ensure that the added tokens are trainable + if peft_config.trainable_token_indices is None: + peft_config.trainable_token_indices = {"embed_tokens": added_tokens} + elif "embed_tokens" not in peft_config.trainable_token_indices: + peft_config.trainable_token_indices["embed_tokens"] = added_tokens + else: + peft_config.trainable_token_indices["embed_tokens"].extend(added_tokens) + + # Ensure that the lm_head is trainable + if peft_config.modules_to_save is None or "lm_head" not in peft_config.modules_to_save: + logger.warning( + "Cloning chat template added new tokens to the tokenizer, but 'lm_head' is not in PEFT's " + "`modules_to_save`. As a result, the model may not learn to generate outputs with these new " + "tokens, leading to degraded generation quality. To fix this, add " + "`modules_to_save=['lm_head']` to your PEFT configuration." + ) + + if peft_config.modules_to_save is None: + peft_config.modules_to_save = ["lm_head"] + else: + peft_config.modules_to_save.append("lm_head") + + # In Prompt Tuning a small set of trainable virtual tokens [continuous prompt embeddings] is prepended to the + # input. We store the number of these tokens so we can account for them correctly when calculating accuracy. + self.num_virtual_tokens = 0 + + if False: + model = prepare_peft_model(model, peft_config, args) + if model.active_adapter in model.peft_config: + peft_model_config = model.peft_config[model.active_adapter] + self.num_virtual_tokens = getattr(peft_model_config, "num_virtual_tokens", 0) + + # Data collator + # BFD packing requires padding-free mode; otherwise, the collator outputs padded attention masks, causing + # FlashAttention to ignore position_ids and recompute them incorrectly from the padded attention mask. + self.padding_free = args.padding_free or (args.packing and args.packing_strategy == "bfd") + use_flash_attention = model.config._attn_implementation in [ + "flash_attention_2", + "flash_attention_3", + "kernels-community/vllm-flash-attn3", + ] + if self.padding_free: + if data_collator is not None: + raise ValueError("Passing a custom data collator is not supported when using padding-free.") + if args.packing and args.packing_strategy == "wrapped": + logger.warning( + "You are passing `padding_free=True` with the 'wrapped' packing strategy, which is not " + "recommended. 
Please refer to the documentation to understand why this is not recommended."
+                )
+            if not use_flash_attention:
+                logger.warning(
+                    "Padding-free training is enabled, but the attention implementation is not set to "
+                    "'flash_attention_2'. Padding-free training flattens batches into a single sequence, and "
+                    "'flash_attention_2' is the only known attention mechanism that reliably supports this. Using "
+                    "other implementations may lead to unexpected behavior. To ensure compatibility, set "
+                    "`attn_implementation='flash_attention_2'` in the model configuration, or verify that your "
+                    "attention mechanism can handle flattened sequences."
+                )
+            if args.per_device_train_batch_size == 1 and not args.packing:
+                logger.warning(
+                    "You are using a per_device_train_batch_size of 1 with padding-free training. Using a batch size "
+                    "of 1 annihilates the benefits of padding-free training. Please consider increasing the batch "
+                    "size to at least 2."
+                )
+
+        # Decide whether to use completion-only loss: if not specified, then it is set to True if the dataset format
+        # is prompt-completion, and False if the dataset format is language modeling.
+        dataset_sample = next(iter(train_dataset))
+        if args.completion_only_loss is None:
+            self.completion_only_loss = "prompt" in dataset_sample and "completion" in dataset_sample
+        else:
+            self.completion_only_loss = args.completion_only_loss
+
+        if data_collator is None and not self._is_vlm:
+            # Get the pad token: if not provided, use the one from the processing class or the eos token
+            # if the processing class does not have a pad token.
+            pad_token = args.pad_token or tokenizer.pad_token or tokenizer.eos_token
+            pad_token_id = tokenizer.convert_tokens_to_ids(pad_token)
+            if pad_token_id is None:
+                raise ValueError(
+                    f"The specified `pad_token` ('{pad_token}') is not found in the vocabulary of the given "
+                    f"`processing_class` ({processing_class.__class__.__name__}). Ensure that the `pad_token` exists "
+                    "in the vocabulary before using it as a padding token."
+                )
+            data_collator = DataCollatorForLanguageModeling(
+                pad_token_id=pad_token_id,
+                completion_only_loss=self.completion_only_loss,
+                padding_free=self.padding_free,
+                # Using position_ids without flash_attn hurts the training
+                return_position_ids=use_flash_attention,
+                pad_to_multiple_of=args.pad_to_multiple_of,
+            )
+        elif data_collator is None and self._is_vlm:
+            data_collator = DataCollatorForVisionLanguageModeling(
+                processor=processing_class,
+                max_length=args.max_length,
+                completion_only_loss=self.completion_only_loss,
+                pad_to_multiple_of=args.pad_to_multiple_of,
+                dataset_text_field=args.dataset_text_field,
+            )
+
+        if args.packing and args.packing_strategy == "bfd" and not use_flash_attention:
+            logger.warning(
+                "You are using packing, but the attention implementation is not set to 'flash_attention_2' or "
+                "'kernels-community/vllm-flash-attn3'. Packing flattens batches into a single sequence, and Flash "
+                "Attention is the only known attention mechanism that reliably supports this. Using other "
+                "implementations may lead to cross-contamination between batches. To avoid this, either disable "
+                "packing by setting `packing=False`, or set `attn_implementation='flash_attention_2'` or "
+                "`attn_implementation='kernels-community/vllm-flash-attn3'` in the model configuration."
+            )
+        if args.assistant_only_loss and not is_conversational(dataset_sample):
+            raise ValueError(
+                "You set `assistant_only_loss=True`, but the dataset is not conversational. 
This option is only " + "supported for conversational datasets." + ) + + # Dataset + # Skip dataset preparation if `skip_prepare_dataset=True` in `dataset_kwargs`, or if it's a VLM, where + # preprocessing [e.g., image-to-pixel conversion] is too costly and done on the fly instead. + skip_prepare_dataset = ( + args.dataset_kwargs is not None and args.dataset_kwargs.get("skip_prepare_dataset", False) or self._is_vlm + ) + if not skip_prepare_dataset: + if self.completion_only_loss and formatting_func: + raise ValueError( + "A formatting function was provided while `completion_only_loss=True`, which is incompatible. " + "Using a formatter converts the dataset to a language modeling type, conflicting with " + "completion-only loss. To resolve this, apply your formatting function before passing the " + "dataset, or disable `completion_only_loss` in `SFTConfig`." + ) + train_dataset = self._prepare_dataset( + train_dataset, processing_class, args, args.packing, formatting_func, "train" + ) + if eval_dataset is not None: + packing = args.packing if args.eval_packing is None else args.eval_packing + if isinstance(eval_dataset, dict): + eval_dataset = { + key: self._prepare_dataset(dataset, processing_class, args, packing, formatting_func, key) + for key, dataset in eval_dataset.items() + } + else: + eval_dataset = self._prepare_dataset( + eval_dataset, processing_class, args, packing, formatting_func, "eval" + ) + + # Loss function + if args.loss_type == "nll": + pass # use the default loss + elif args.loss_type == "dft": + if compute_loss_func is not None: + raise ValueError( + "You passed a `compute_loss_func` together with `loss_type='dft'` to the `SFTTrainer`. " + "When using `loss_type='dft'`, the loss function is internally set to the DFT loss, so passing a " + "`compute_loss_func` is not allowed." + ) + compute_loss_func = dft_loss + else: + raise ValueError(f"Invalid `loss_type` {args.loss_type} passed. Supported values are 'nll' and 'dft'.") + + # Initialize the metrics + self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)} + self._total_train_tokens = 0 + + # Initialize the Trainer. 
Parent class will handle: + # - DeepSpeed configuration [through create_accelerator_and_postprocess] + # - FSDP setup + # - Distributed training setup + # - Optimizer and scheduler creation + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + compute_loss_func=compute_loss_func, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + optimizer_cls_and_kwargs=optimizer_cls_and_kwargs, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Initialize activation offloading context + if self.args.activation_offloading: + self.maybe_activation_offload_context = get_act_offloading_ctx_manager(model=self.model) + else: + self.maybe_activation_offload_context = contextlib.nullcontext() + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + self.aux_loss_enabled = getattr(model.config, "output_router_logits", False) + self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0) + if self.aux_loss_enabled and self.aux_loss_coef == 0.0: + logger.warning( + "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to " + "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value " + "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary " + "loss.", + ) + + def _prepare_dataset( + self, + dataset: Union[Dataset, IterableDataset], + processing_class, + args, + packing: bool, + formatting_func: Optional[Callable[[dict], str]], + dataset_name: str, + ) -> Union[Dataset, IterableDataset]: + # All Unsloth Zoo code licensed under LGPLv3 + try: + if isinstance(dataset, ConstantLengthDataset): return dataset + except: + pass + + map_kwargs = {} + use_desc = isinstance(dataset, Dataset) + is_vlm = hasattr(processing_class, "tokenizer") + tokenizer = processing_class + if is_vlm: tokenizer = processing_class.tokenizer + + # Get max length + max_seq_length = getattr(args, "max_length", 0) + if max_seq_length == 0: max_seq_length = getattr(args, "max_seq_length", 0) + if max_seq_length == 0: max_seq_length = getattr(self, "max_seq_length", 0) + if max_seq_length == 0: max_seq_length = getattr(self, "max_seq", 0) + if max_seq_length == 0: raise RuntimeError("Unsloth: max_seq_length is 0! Please specify one!") + dataset_text_field = getattr(args, "dataset_text_field", "text") + do_truncation = max_seq_length != 0 + do_formatting_func = False + do_tokenize = True + + # Get correct column names + column_names = set(next(iter(dataset)).keys()) + used_column_names = ["input_ids"] + if "attention_mask" in column_names: + used_column_names.append("attention_mask") + + # Check if already tokenized so skip + from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling + if "labels" in column_names: + # Most likely forgot data collator! 
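+            # Descriptive note on this collator dispatch (behavior unchanged): a "labels" column
+            # means the dataset was fully prepared upstream, so DataCollatorForSeq2Seq pads
+            # input_ids and labels together and tokenization is skipped; "input_ids" alone is
+            # treated as plain causal-LM data, where DataCollatorForLanguageModeling with
+            # mlm=False derives labels from the inputs; if neither column exists, the raw text
+            # must first go through `formatting_func` / `dataset_text_field` below.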
+ if is_vlm and not hasattr(tokenizer, "pad"): + # Check if processing_class has a .pad, if not, use tokenizer.tokenizer + raise RuntimeError(f"Unsloth: {processing_class.__class__} does not have .pad!") + self.data_collator = DataCollatorForSeq2Seq(tokenizer) + used_column_names.append("labels") + do_tokenize = False + elif "input_ids" in column_names: + # Skip dataset prep, and set data collator + if is_vlm and not hasattr(tokenizer, "pad"): + # Check if processing_class has a .pad, if not, use tokenizer.tokenizer + raise RuntimeError(f"Unsloth: {processing_class.__class__} does not have .pad!") + self.data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False) + do_tokenize = False + elif dataset_text_field not in column_names: + do_formatting_func = True + if formatting_func is None: + raise RuntimeError("Unsloth: You must specify a `formatting_func`") + pass + + if do_tokenize: + # Check double BOS tokens + if do_formatting_func: + test_text = formatting_func(next(iter(dataset))) + if not isinstance(test_text, list): + raise ValueError( + "Unsloth: The `formatting_func` should return a list of processed strings." + ) + test_text = test_text[0] + else: + test_text = next(iter(dataset))[dataset_text_field][0] + + # Get chat template + chat_template = getattr(processing_class, 'chat_template', '') + if chat_template == '' and is_vlm: + chat_template = getattr(tokenizer, 'chat_template', '') + if chat_template is None: + chat_template = '' + + # Get bos_token + add_special_tokens = True + bos_token_1 = getattr(processing_class, 'bos_token', None) + bos_token_2 = getattr(tokenizer, 'bos_token', None) + bos_token = bos_token_1 or bos_token_2 + + if bos_token is not None: + if test_text.startswith(bos_token) or bos_token in chat_template: + add_special_tokens = False + print("Unsloth: We found double BOS tokens - we shall remove one automatically.") + pass + + # Create tokenize function + def _tokenize(example): + return tokenizer( + example[dataset_text_field] if not do_formatting_func else formatting_func(example), + truncation = do_truncation, + max_length = max_seq_length, + return_token_type_ids = False, + add_special_tokens = add_special_tokens, + ) + pass + + if not isinstance(dataset, IterableDataset): + dataset_num_proc = getattr(args, "dataset_num_proc", None) + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = max(cpu_count()+4, 2) + map_kwargs["num_proc"] = dataset_num_proc + else: + map_kwargs["batch_size"] = dataset._ex_iterable.batch_size + + if use_desc: map_kwargs["desc"] = f'Unsloth: Tokenizing ["{dataset_text_field}"]' + dataset = dataset.map(_tokenize, batched = True, **map_kwargs) + + # If VLM, switch data collator since .pad is needed! 
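+            # (Multimodal processor classes typically do not expose a top-level `.pad` the way
+            # tokenizers do, so padding has to route through the wrapped inner tokenizer instead.)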
+ if is_vlm and not hasattr(processing_class, "pad"): + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False) + self.data_collator = data_collator + pass + pass + if packing: + # Try using new packing which works in TRL + try: + pack_dataset + except: + print("Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!") + return dataset + + if max_seq_length == 0: + raise ValueError("When packing is enabled, `max_seq_length` can't be `None`.") + + if use_desc: map_kwargs["desc"] = f"Unsloth: Packing {dataset_name} dataset" + dataset = pack_dataset( + dataset.select_columns(used_column_names), + max_seq_length, + getattr(args, "packing_strategy", "bfd"), + map_kwargs, + ) + pass + return dataset + + def _set_signature_columns_if_needed(self): + # If `self.args.remove_unused_columns` is True, non-signature columns are removed. + # By default, this method sets `self._signature_columns` to the model's expected inputs (usually, "input_ids" + # and "attention_mask"). When using `train_on_completion_only` we add a "completion_mask" column to the + # dataset. So we need to override the default signature columns to include "completion_mask" as well. + if self._signature_columns is None: + if self._is_vlm: + self._signature_columns = ["messages", "prompt", "completion", "images"] + else: + self._signature_columns = ["input_ids", "labels", "seq_lengths", "completion_mask", "assistant_masks"] + + def compute_loss(self, model, inputs, return_outputs = False, num_items_in_batch = None): + outputs = super().compute_loss( + model, + inputs, + return_outputs = return_outputs, + num_items_in_batch = num_items_in_batch, + ) + return outputs + + # Override training step to add activation offloading context. + def training_step(self, *args, **kwargs): + with self.maybe_activation_offload_context: + return super().training_step(*args, **kwargs) + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + mode = "train" if self.model.training else "eval" + metrics = {key: sum(val) / len(val) for key, val in self._metrics[mode].items()} # average the metrics + + # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs` + # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format. + if mode == "eval": + metrics = {f"eval_{key}": val for key, val in metrics.items()} + + logs.update(metrics) + super().log(logs, start_time) + self._metrics[mode].clear() + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
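+
+        Example (illustrative values; these strings are placeholders, not required names):
+
+        ```python
+        trainer.create_model_card(
+            model_name="my-sft-model",
+            dataset_name="roneneldan/TinyStories",
+            tags=["sft", "demo"],
+        )
+        ```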
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + if "JOB_ID" in os.environ: + tags.add("hf_jobs") + + tags.update(self._tag_names) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=list(tags), + wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="SFT", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothSFTTrainer(_UnslothSFTTrainer): + """ + +Trainer for Supervised Fine-Tuning (SFT) method. + +This class is a wrapper around the [`~transformers.Trainer`] class and inherits all of its attributes and methods. + +Example: + +```python +from datasets import load_dataset +from trl import SFTTrainer + +dataset = load_dataset("roneneldan/TinyStories", split="train[:1%]") + +trainer = SFTTrainer(model="Qwen/Qwen2-0.5B-Instruct", train_dataset=dataset) +trainer.train() +``` + +Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using `.from_pretrained` (where `` is derived from the model + config) with the keyword arguments in `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. + If you're training a model with an MoE architecture and want to include the load balancing/auxilliary loss + as a part of the final loss, remember to set the `output_router_logits` config of the model to `True`. + args ([`SFTConfig`], *optional*, defaults to `None`): + Configuration for this trainer. If `None`, a default configuration is used. + data_collator ([`~transformers.DataCollator`] or `None`, *optional*): + Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`. + Will default to [`~trainer.sft_trainer.DataCollatorForLanguageModeling`] if the model is a language model + and [`~trainer.sft_trainer.DataCollatorForVisionLanguageModeling`] if the model is a vision-language model. + train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]): + Dataset to use for training. SFT supports both [language modeling](#language-modeling) type and + [prompt-completion](#prompt-completion) type. The format of the samples can be either: + + - [Standard](dataset_formats#standard): Each sample contains plain text. + - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role + and content). + + The trainer also supports processed datasets (tokenized) as long as they contain an `input_ids` field. + eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`): + Dataset to use for evaluation. It must meet the same requirements as `train_dataset`. 
+    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`] or `None`, *optional*, defaults to `None`):
+        Processing class used to process the data. If `None`, the processing class is loaded from the model's name
+        with [`~transformers.AutoProcessor.from_pretrained`]. A padding token, `tokenizer.pad_token`, must be set.
+        If the processing class has not set a padding token, `tokenizer.eos_token` will be used as the default.
+    compute_loss_func (`Callable` or `None`, *optional*, defaults to `None`):
+        A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated
+        batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, see the default [loss
+        function](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/trainer.py#L3618)
+        used by [`Trainer`].
+    compute_metrics (`Callable[[EvalPrediction], dict]` or `None`, *optional*, defaults to `None`):
+        The function used to compute metrics at evaluation. Must take a [`~transformers.EvalPrediction`] and return
+        a dictionary mapping metric names to metric values. When passing [`SFTConfig`] with `batch_eval_metrics` set
+        to `True`, your `compute_metrics` function must take a boolean `compute_result` argument. This will be
+        triggered after the last eval batch to signal that the function needs to calculate and return the global
+        summary statistics rather than accumulating the batch-level statistics.
+    callbacks (list of [`~transformers.TrainerCallback`] or `None`, *optional*, defaults to `None`):
+        List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
+        [here](https://huggingface.co/docs/transformers/main_classes/callback).
+
+        If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
+        method.
+    optimizers (`tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]]`, *optional*, defaults to `(None, None)`):
+        A tuple containing the optimizer and the scheduler to use. Will default to an instance of `AdamW` on your
+        model and a scheduler given by [`~transformers.get_linear_schedule_with_warmup`] controlled by `args`.
+    optimizer_cls_and_kwargs (`tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*, defaults to `None`):
+        A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in
+        `args`. Incompatible with the `optimizers` argument.
+
+        Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before
+        initializing the Trainer.
+    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*, defaults to `None`):
+        A function that preprocesses the logits right before caching them at each evaluation step. Must take two
+        tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
+        by this function will be reflected in the predictions received by `compute_metrics`.
+
+        Note that the labels (second parameter) will be `None` if the dataset does not have them.
+    peft_config ([`~peft.PeftConfig`] or `None`, *optional*, defaults to `None`):
+        PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
+    formatting_func (`Callable` or `None`, *optional*, defaults to `None`):
+        Formatting function applied to the dataset before tokenization.
Applying the formatting function explicitly + converts the dataset into a [language modeling](#language-modeling) type. + + """ + def __init__( + self, + model, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + compute_loss_func = None, + compute_metrics = None, + callbacks = None, + optimizer_cls_and_kwargs = None, + preprocess_logits_for_metrics = None, + peft_config = None, + formatting_func = None, + **kwargs + ): + if args is None: args = UnslothSFTConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' + if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = 
False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if 'max_length' not in locals() and not hasattr(args, 'max_length'): + pass + else: + if hasattr(args, 'max_seq_length') and args.max_seq_length is not None and args.max_seq_length > 0: + if hasattr(args, 'max_length'): + args.max_length = args.max_seq_length + max_length = args.max_length + else: + model_max_length = getattr(model, 'max_seq_length', None) + if model_max_length is None: model_max_length = getattr(model, 'max_length', None) + if model_max_length is not None: + args.max_length = model_max_length + max_length = args.max_length + elif hasattr(args, 'max_length') and args.max_length is not None: + max_length = args.max_length + # if we are here, then we are in a weird case where max_length is set but max_seq_length is not set + setattr(model, 'max_seq_length', max_length) + else: + print('Unsloth: We did not find `max_seq_length` or `max_length` in the model or args. 
We will set it to 1024.') + args.max_length = 1024 + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('sft_trainer', other_metrics) + IGNORED_TOKENIZER_NAMES = os.environ.get('UNSLOTH_IGNORED_TOKENIZER_NAMES', '').split('\n') + from unsloth_zoo.tokenizer_utils import fix_untrained_tokens + from unsloth_zoo.training_utils import fix_zero_training_loss + if 'tokenizer' not in locals(): tokenizer = processing_class + fix_untrained_tokens(model, tokenizer, train_dataset, IGNORED_TOKENIZER_NAMES, eps = 1e-16) + fix_zero_training_loss(model, tokenizer, train_dataset) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? 
[TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + model = model, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + compute_loss_func = compute_loss_func, + compute_metrics = compute_metrics, + callbacks = callbacks, + optimizer_cls_and_kwargs = optimizer_cls_and_kwargs, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config, + formatting_func = formatting_func,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass + + +if hasattr(logger, "addFilter"): + import logging + class HideLoggingMessage(logging.Filter): + def __init__(self, text): self.text = text + def filter(self, x): return not (self.text in x.getMessage()) + pass + logger.addFilter(HideLoggingMessage("`use_cache=True`")) + diff --git a/unsloth_compiled_cache/UnslothXPOTrainer.py b/unsloth_compiled_cache/UnslothXPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..4d3c790edb26dd8619b46ffe2a978ee9227d0afe --- /dev/null +++ b/unsloth_compiled_cache/UnslothXPOTrainer.py @@ -0,0 +1,1334 @@ +""" +2025.11.2 +2025.11.1 +4.57.2 +0.23.0 +__UNSLOTH_VERSIONING__ +""" + +# Unsloth auto generated code +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . 
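+
+# NOTE: Auto-generated compiler-cache module; it shadows TRL's XPOConfig/XPOTrainer with
+# Unsloth-patched variants (e.g. seed 3407, adamw_8bit, gradient checkpointing on by
+# default, see UnslothXPOConfig below) and is typically rewritten on regeneration.
+# A minimal usage sketch, kept in comments so nothing runs on import (names such as
+# `model`, `tokenizer`, `judge` and `train_dataset` are assumed to exist in the caller):
+#
+#     from unsloth_compiled_cache.UnslothXPOTrainer import UnslothXPOConfig, UnslothXPOTrainer
+#     config  = UnslothXPOConfig(output_dir="xpo_outputs", alpha=1e-5)
+#     trainer = UnslothXPOTrainer(model=model, ref_model=None, judge=judge, args=config,
+#                                 train_dataset=train_dataset, processing_class=tokenizer)
+#     trainer.train()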
+ +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.xpo_trainer import (Any, BaseImageProcessor, BasePairwiseJudge, Callable, Dataset, EvalPrediction, F, FeatureExtractionMixin, IterableDataset, OnlineDPOTrainer, OptimizerNames, Optional, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SIMPLE_CHAT_TEMPLATE, TrainerCallback, Union, XPOConfig, XPOTrainer, empty_cache, generate_model_card, get_comet_experiment_url, get_reward, is_conversational, is_peft_available, is_wandb_available, jinja2, maybe_apply_chat_template, nn, os, selective_log_softmax, textwrap, torch, truncate_right, unwrap_model_for_generation) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode + +# Wrap trainer with padding to right and enable training mode +import functools +from types import MethodType +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, 'model') and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def chunked_selective_log_softmax(logits, index): + # Split into 4 chunks only + chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0) + chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0) + all_per_token_logps = [] + # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) + for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): + chunk_logits = chunk_logits.to(torch.float32) + selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) + logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values + all_per_token_logps.append(per_token_logps) + pass + all_per_token_logps = torch.concat(all_per_token_logps) + all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) + return all_per_token_logps + +def calculate_pad_tokens_in_prompt( + input_ids: torch.Tensor, + logits_to_keep: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given prompt tensor, it returns all the left padded tokens in that sequence. 
so [pad, pad, pad, cat] = 3 tokens + """ + if logits_to_keep >= input_ids.shape[1]: + raise ValueError("logits_to_keep must be smaller than the sequence length.") + + prompt_section = input_ids[:, :-logits_to_keep] + + padding_mask = (prompt_section == pad_token_id) + + pad_token_counts = padding_mask.sum(dim=1) + + return pad_token_counts + +def create_completion_attention_mask( + completion_input_ids: torch.Tensor, + left_pad_tokens_per_prompt: torch.Tensor, + max_left_pad: int, + pad_token_id: int +) -> torch.Tensor: + """ + Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad] + + Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens + and pad are pad tokens, this function would make a completion mask that would 0 out the pad + and p tokens. so in this example [0,0,0,1,1,1,0,0,0] + """ + batch_size, completion_len = completion_input_ids.shape + device = completion_input_ids.device + + num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt + + indices = torch.arange(completion_len, device=device).unsqueeze(0) + shift_mask = indices >= num_tokens_to_mask.unsqueeze(1) + + non_padding_mask = (completion_input_ids != pad_token_id) + + final_mask = shift_mask & non_padding_mask + + return final_mask + +def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor: + """ + Moves all padding tokens in each sequence of a batch to the right. + """ + mask = (tensor != pad_id) + # Must do stable=True since binary mark is unordered + sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True) + packed_tensor = torch.gather(tensor, 1, sorted_indices) + return packed_tensor + +def align_logprobs_with_mask( + logprob_tensor: torch.Tensor, + attention_mask: torch.Tensor, + pad_value: float = 0.0 +) -> torch.Tensor: + """ + Aligns a log probability tensor with a given attention mask. + """ + + device = logprob_tensor.device + batch_size, logprob_seq_len = logprob_tensor.shape + mask_seq_len = attention_mask.shape[1] + + padded_logprobs = torch.full( + attention_mask.shape, + fill_value=pad_value, + dtype=logprob_tensor.dtype, + device=device + ) + + left_pad_counts = torch.argmax(attention_mask, dim=1) + + cols = torch.arange(logprob_seq_len, device=device) + dest_indices = left_pad_counts.unsqueeze(1) + cols + + # Create destination row indices + # Shape: [batch_size, logprob_seq_len] + row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices) + + # --- 4. Filter out-of-bounds indices and perform assignment --- + # Create a mask to identify only the indices that are within the bounds + # of the target tensor's sequence length. + valid_mask = dest_indices < mask_seq_len + + # Use this mask to select only the valid row indices, column indices, + # and the corresponding values from the logprob tensor. + # This flattens the selected elements into 1D tensors. + valid_rows = row_indices[valid_mask] + valid_cols = dest_indices[valid_mask] + valid_vals = logprob_tensor[valid_mask] + + # Place the valid values into their correct positions in the padded tensor + # using a single, efficient advanced indexing operation. + padded_logprobs[valid_rows, valid_cols] = valid_vals + + return padded_logprobs +@dataclass +class UnslothXPOConfig(XPOConfig): + """ + +Configuration class for the [`XPOTrainer`]. + +Subclass of [`OnlineDPOConfig`] we can use all its arguments and add the following: + +Parameters: + alpha (`float` or `list[float]`, *optional*, defaults to `1e-5`): + Weight of the XPO loss term. 
If a list of floats is provided then the alpha is selected for each new epoch + and the last alpha is used for the rest of the epochs. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + max_seq_length : Optional[int] = field( + default = None, + metadata = {'help': 'Maximum sequence length to truncate to.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = None, + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + parallelism_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + project = 'huggingface', + trackio_space_id = 'trackio', + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = True, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + 
mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = True, + reward_model_path = None, + judge = None, + max_new_tokens = 64, + max_length = 512, + temperature = 0.9, + top_p = 1.0, + top_k = None, + min_p = None, + repetition_penalty = 1.0, + generation_kwargs = {}, + use_transformers_paged = False, + cache_implementation = None, + missing_eos_penalty = None, + loss_type = 'sigmoid', + disable_dropout = True, + use_vllm = False, + vllm_model_impl = 'vllm', + vllm_guided_decoding_regex = None, + vllm_gpu_memory_utilization = 0.55, + vllm_mode = 'colocate', + vllm_server_base_url = None, + vllm_server_host = '0.0.0.0', + vllm_server_port = 8000, + vllm_server_timeout = 240.0, + vllm_tensor_parallel_size = 1, + ds3_gather_for_generation = True, + model_init_kwargs = None, + reward_weights = None, + dataset_num_proc = None, + gpu_memory_utilization = None, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + max_seq_length = None, + **kwargs, + ): + if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = min(max(cpu_count()+4, 2), 64) + if temperature <= 0: + raise MathError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.') + elif temperature >= 10: + raise MathError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.') + + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + 
logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + parallelism_config = parallelism_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + project = project, + trackio_space_id = trackio_space_id, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + 
            neftune_noise_alpha = neftune_noise_alpha,
+            optim_target_modules = optim_target_modules,
+            batch_eval_metrics = batch_eval_metrics,
+            eval_on_start = eval_on_start,
+            use_liger_kernel = use_liger_kernel,
+            liger_kernel_config = liger_kernel_config,
+            eval_use_gather_object = eval_use_gather_object,
+            average_tokens_across_devices = average_tokens_across_devices,
+            reward_model_path = reward_model_path,
+            judge = judge,
+            max_new_tokens = max_new_tokens,
+            max_length = max_length,
+            temperature = temperature,
+            top_p = top_p,
+            top_k = top_k,
+            min_p = min_p,
+            repetition_penalty = repetition_penalty,
+            generation_kwargs = generation_kwargs,
+            use_transformers_paged = use_transformers_paged,
+            cache_implementation = cache_implementation,
+            missing_eos_penalty = missing_eos_penalty,
+            loss_type = loss_type,
+            disable_dropout = disable_dropout,
+            use_vllm = use_vllm,
+            vllm_model_impl = vllm_model_impl,
+            vllm_guided_decoding_regex = vllm_guided_decoding_regex,
+            vllm_gpu_memory_utilization = vllm_gpu_memory_utilization,
+            vllm_mode = vllm_mode,
+            vllm_server_base_url = vllm_server_base_url,
+            vllm_server_host = vllm_server_host,
+            vllm_server_port = vllm_server_port,
+            vllm_server_timeout = vllm_server_timeout,
+            vllm_tensor_parallel_size = vllm_tensor_parallel_size,
+            ds3_gather_for_generation = ds3_gather_for_generation,
+            model_init_kwargs = model_init_kwargs,
+            reward_weights = reward_weights,
+            dataset_num_proc = dataset_num_proc,
+            gpu_memory_utilization = gpu_memory_utilization,
+            **kwargs,
+        )
+        self.vllm_sampling_params = vllm_sampling_params
+        self.unsloth_num_chunks = unsloth_num_chunks
+        self.max_seq_length = max_seq_length
+pass
+
+class _UnslothXPOTrainer(OnlineDPOTrainer):
+    r"""
+    Initialize XPOTrainer as a subclass of [`OnlineDPOTrainer`].
+
+    Args:
+        model (`transformers.PreTrainedModel`):
+            The model to train, preferably an `AutoModelForCausalLM`.
+        ref_model (`PreTrainedModelWrapper`):
+            Hugging Face transformer model with a causal language modeling head. Used for implicit reward computation
+            and loss. If no reference model is provided, the trainer will create a reference model with the same
+            architecture as the model to be optimized.
+        reward_funcs (`transformers.PreTrainedModel`):
+            The reward model to score completions with, preferably an `AutoModelForSequenceClassification`.
+        judge (`BasePairwiseJudge`):
+            The judge to use for pairwise comparison of model completions.
+        args (`XPOConfig`):
+            The XPO config arguments to use for training.
+        data_collator (`transformers.DataCollator`):
+            The data collator to use for training. If None is specified, the default data collator
+            (`DPODataCollatorWithPadding`) will be used, which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        train_dataset (`datasets.Dataset`):
+            The dataset to use for training.
+        eval_dataset (`datasets.Dataset`):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        peft_config (`dict`):
+            The peft config to use for training.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take an `EvalPrediction` and return a dictionary mapping
+            metric names to metric values.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+
+    .. deprecated:: 0.22.0
+        The following parameters are deprecated and will be removed in a future version:
+
+        * `reward_model`: Use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`.
+        * `reward_processing_class`: Use `reward_processing_classes` instead. For example, change
+          `reward_processing_class=tokenizer` to `reward_processing_classes=tokenizer`.
+    """
+
+    _tag_names = ["trl", "xpo"]
+
+    def __init__(
+        self,
+        model: Optional[Union[PreTrainedModel, nn.Module]] = None,
+        ref_model: Optional[Union[PreTrainedModel, nn.Module]] = None,
+        reward_funcs: Optional[nn.Module] = None,
+        judge: Optional[BasePairwiseJudge] = None,
+        args: Optional[XPOConfig] = None,
+        data_collator: Optional[Callable] = None,
+        train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
+        processing_class: Optional[
+            Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
+        ] = None,
+        reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None,
+        peft_config: Optional[dict] = None,
+        compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
+        # Deprecated parameters
+        reward_model: Optional[Union[PreTrainedModel, nn.Module]] = None,
+    ) -> None:
+        super().__init__(
+            model=model,
+            ref_model=ref_model,
+            judge=judge,
+            reward_funcs=reward_funcs,
+            reward_model=reward_model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            reward_processing_classes=reward_processing_classes,
+            peft_config=peft_config,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        self._alpha = self.args.alpha
+
+        # Overwrite the stats dictionary to include XPO-specific statistics
+        self.stats = {
+            # Remove "non_score_reward", "rlhf_reward", "scores"
+            # Add "loss/dpo", "loss/xpo"
+            "loss/dpo": [],
+            "loss/xpo": [],
+            "objective/kl": [],
+            "objective/entropy": [],
+            "rewards/chosen": [],
+            "rewards/rejected": [],
+            "rewards/accuracies": [],
+            "rewards/margins": [],
+            "logps/chosen": [],
+            "logps/rejected": [],
+            # Replace "contain_eos_token" by "model_contain_eos_token" and "ref_contain_eos_token"
+            "val/model_contain_eos_token": [],
+            "val/ref_contain_eos_token": [],
+            "alpha": [],
+            "beta": [],
+        }
+        if self.reward_funcs is not None:
+            if len(self.reward_funcs) != 1:
+                raise ValueError("XPOTrainer only supports one reward function/model.")
+            self.reward_funcs = self.reward_funcs[0]
+            self.stats["objective/model_scores"] = []
+
self.stats["objective/ref_scores"] = [] + self.stats["objective/scores_margin"] = [] + + @property + def alpha(self): + if isinstance(self._alpha, list): + epoch = self.state.epoch + return self._alpha[epoch] if epoch < len(self._alpha) else self._alpha[-1] + else: + return self._alpha + + def _generate_completions(self, prompts, model): + with unwrap_model_for_generation(model, self.accelerator) as unwrapped_policy_model_for_gen: + model_output = unwrapped_policy_model_for_gen.generate( + input_ids=prompts["input_ids"], + attention_mask=prompts["attention_mask"], + generation_config=self.generation_config, + ) + + actual_model_for_ref_generation: torch.nn.Module + if self.ref_model is None: + unwrapped_main_model_for_ref_logic = self.accelerator.unwrap_model(model) + + if is_peft_available() and isinstance(unwrapped_main_model_for_ref_logic, PeftModel): + actual_model_for_ref_generation = unwrapped_main_model_for_ref_logic.get_base_model() + else: + actual_model_for_ref_generation = unwrapped_main_model_for_ref_logic + else: + actual_model_for_ref_generation = self.accelerator.unwrap_model(self.ref_model) + + with unwrap_model_for_generation(actual_model_for_ref_generation, self.accelerator) as final_ref_model_for_gen: + ref_output = final_ref_model_for_gen.generate( + input_ids=prompts["input_ids"], + attention_mask=prompts["attention_mask"], + generation_config=self.generation_config, + ) + + return model_output, ref_output + + def _process_completions(self, model_output, ref_output, prompts): + context_length = prompts["input_ids"].shape[1] + + # Process model completions + model_completion_ids = model_output[:, context_length:] + model_completion_ids, model_completion_mask = truncate_right( + model_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id + ) + model_data = { + "input_ids": torch.cat((prompts["input_ids"], model_completion_ids), dim=1), + "attention_mask": torch.cat((prompts["attention_mask"], model_completion_mask), dim=1), + "raw": prompts["raw"], + } + + # Process reference model completions + ref_completion_ids = ref_output[:, context_length:] + ref_completion_ids, ref_completion_mask = truncate_right( + ref_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id + ) + ref_data = { + "input_ids": torch.cat((prompts["input_ids"], ref_completion_ids), dim=1), + "attention_mask": torch.cat((prompts["attention_mask"], ref_completion_mask), dim=1), + "raw": prompts["raw"], + } + + return model_data, ref_data + + def _compute_rewards(self, model_data, ref_data, context_length): + with torch.no_grad(): + _, model_scores, _ = get_reward( + self.reward_funcs, model_data["input_ids"], self.processing_class.pad_token_id, context_length + ) + _, ref_scores, _ = get_reward( + self.reward_funcs, ref_data["input_ids"], self.processing_class.pad_token_id, context_length + ) + + # Apply EOS penalty if needed + if self.args.missing_eos_penalty is not None: + model_contain_eos = torch.any(model_data["input_ids"] == self.processing_class.eos_token_id, dim=-1) + ref_contain_eos = torch.any(ref_data["input_ids"] == self.processing_class.eos_token_id, dim=-1) + model_scores[~model_contain_eos] -= self.args.missing_eos_penalty + ref_scores[~ref_contain_eos] -= self.args.missing_eos_penalty + + return model_scores, ref_scores + + def _compute_judge(self, model_data, ref_data, context_length): + prompts = model_data["raw"] + model_data_completions = self.processing_class.batch_decode( + model_data["input_ids"][:, context_length:], 
skip_special_tokens=True + ) + model_data_completions = [completion.strip() for completion in model_data_completions] + + ref_data_completions = self.processing_class.batch_decode( + ref_data["input_ids"][:, context_length:], skip_special_tokens=True + ) + ref_data_completions = [completion.strip() for completion in ref_data_completions] + + if is_conversational({"prompt": prompts[0]}): + model_data_completions = [ + [{"role": "assistant", "content": completion}] for completion in model_data_completions + ] + environment = jinja2.Environment() + template = environment.from_string(SIMPLE_CHAT_TEMPLATE) + prompts = [template.render(messages=message) for message in prompts] + model_data_completions = [template.render(messages=completion) for completion in model_data_completions] + + ref_data_completions = [ + [{"role": "assistant", "content": completion}] for completion in ref_data_completions + ] + ref_data_completions = [template.render(messages=completion) for completion in ref_data_completions] + + ranks_of_first_completion = self.judge.judge( + prompts, + list(zip(model_data_completions, ref_data_completions)), + ) + # convert ranks to a True/False mask: + # when rank == 0, it means the first completion is the best + # when rank == 1, it means the second completion is the best + return torch.tensor([rank == 0 for rank in ranks_of_first_completion], device=model_data["input_ids"].device) + + def _compute_logprobs(self, model, model_data, ref_data, context_length): + def compute_logprobs_for_data(m, data): + output = m(data["input_ids"], attention_mask=data["attention_mask"]) + logits = output.logits[:, context_length - 1 : -1] + token_logprobs = selective_log_softmax(logits, data["input_ids"][:, context_length:]) + return token_logprobs + + # Compute logprobs for model completions + model_logprobs_model_data = compute_logprobs_for_data(model, model_data) + # Compute logprobs for model on reference completions (for XPO loss) + model_logprobs_ref_data = compute_logprobs_for_data(model, ref_data) + + # Compute logprobs for reference model completions + with torch.no_grad(): + if self.ref_model is None: + with model.disable_adapter(): + ref_logprobs_model_data = compute_logprobs_for_data(model, model_data) + ref_logprobs_ref_data = compute_logprobs_for_data(model, ref_data) + else: + ref_logprobs_model_data = compute_logprobs_for_data(self.ref_model, model_data) + ref_logprobs_ref_data = compute_logprobs_for_data(self.ref_model, ref_data) + + # Mask padding tokens + model_padding_mask = model_data["attention_mask"][:, context_length:] == 0 + ref_padding_mask = ref_data["attention_mask"][:, context_length:] == 0 + model_logprobs_model_data = model_logprobs_model_data.masked_fill(model_padding_mask, 0.0) + model_logprobs_ref_data = model_logprobs_ref_data.masked_fill(ref_padding_mask, 0.0) + ref_logprobs_ref_data = ref_logprobs_ref_data.masked_fill(ref_padding_mask, 0.0) + ref_logprobs_model_data = ref_logprobs_model_data.masked_fill(model_padding_mask, 0.0) + + return model_logprobs_model_data, model_logprobs_ref_data, ref_logprobs_ref_data, ref_logprobs_model_data + + def _compute_losses( + self, + model_logprobs_model_data, + model_logprobs_ref_data, + ref_logprobs_ref_data, + ref_logprobs_model_data, + chosen_mask, + ): + # Compute log probs + model_logprobs_model_data_sum = model_logprobs_model_data.sum(1) + model_logprobs_ref_data_sum = model_logprobs_ref_data.sum(1) + ref_logprobs_ref_data_sum = ref_logprobs_ref_data.sum(1) + ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1) + 
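+        # The four sums above collapse per-token logprobs into one sequence-level
+        # logprob per completion. Below, `chosen_mask` routes each (model, ref)
+        # completion pair into chosen/rejected slots, and the total loss combines
+        # the DPO preference term with the XPO exploration term:
+        #     loss = -logsigmoid(beta * (chosen_log_ratios - rejected_log_ratios))
+        #            + alpha * model_logprobs_ref_data_sum
+        # (sigmoid loss shown; the 'ipo' branch swaps in a squared loss instead).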
+ chosen_model_logprobs = torch.where(chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum) + chosen_ref_logprobs = torch.where(chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum) + chosen_log_ratios = chosen_model_logprobs - chosen_ref_logprobs + + rejected_model_logprobs = torch.where(~chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum) + rejected_ref_logprobs = torch.where(~chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum) + rejected_log_ratios = rejected_model_logprobs - rejected_ref_logprobs + + # Compute logits as the difference between chosen and rejected log ratios + logits = chosen_log_ratios - rejected_log_ratios + + if self.args.loss_type == "sigmoid": + dpo_losses = -F.logsigmoid(self.beta * logits) + elif self.args.loss_type == "ipo": + dpo_losses = (logits - 1 / (2 * self.beta)) ** 2 + else: + raise NotImplementedError(f"invalid loss type {self.args.loss_type}") + + # Compute XPO specific loss + xpo_losses = self.alpha * model_logprobs_ref_data_sum + + # Total loss + loss = (dpo_losses + xpo_losses).mean() + + return loss, dpo_losses, xpo_losses + + def _log_statistics( + self, + model_data, + ref_data, + model_logprobs_model_data, + model_logprobs_ref_data, + ref_logprobs_ref_data, + ref_logprobs_model_data, + chosen_mask, + dpo_losses, + xpo_losses, + context_length, + model_scores=None, + ref_scores=None, + ): + # Helper function to gather and compute mean + def gather_mean(tensor): + return self.accelerator.gather_for_metrics(tensor).mean().item() + + # Log losses + self.stats["loss/dpo"].append(gather_mean(dpo_losses)) + self.stats["loss/xpo"].append(gather_mean(xpo_losses)) + + # Log scores + if self.reward_funcs is not None: + self.stats["objective/model_scores"].append(gather_mean(model_scores)) + self.stats["objective/ref_scores"].append(gather_mean(ref_scores)) + self.stats["objective/scores_margin"].append(gather_mean(model_scores - ref_scores)) + + # Log logprobs + model_logprobs_model_data_sum = model_logprobs_model_data.sum(1) + model_logprobs_ref_data_sum = model_logprobs_ref_data.sum(1) + ref_logprobs_ref_data_sum = ref_logprobs_ref_data.sum(1) + ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1) + + chosen_model_logprobs = torch.where(chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum) + chosen_ref_logprobs = torch.where(chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum) + chosen_log_ratios = chosen_model_logprobs - chosen_ref_logprobs + + rejected_model_logprobs = torch.where(~chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum) + rejected_ref_logprobs = torch.where(~chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum) + rejected_log_ratios = rejected_model_logprobs - rejected_ref_logprobs + + self.stats["logps/chosen"].append(gather_mean(chosen_model_logprobs.mean() + chosen_ref_logprobs.mean())) + self.stats["logps/rejected"].append(gather_mean(rejected_model_logprobs.mean() + rejected_ref_logprobs.mean())) + + # Log rewards + # Compute various statistics + chosen_rewards = chosen_log_ratios * self.beta + rejected_rewards = rejected_log_ratios * self.beta + self.stats["rewards/chosen"].append(gather_mean(chosen_rewards.mean())) + self.stats["rewards/rejected"].append(gather_mean(rejected_rewards.mean())) + + # Calculate KL divergence for model and ref data + kl_model_data = model_logprobs_model_data - ref_logprobs_model_data + kl_ref_data = model_logprobs_ref_data - ref_logprobs_ref_data + mean_kl 
= (kl_model_data.sum(1) + kl_ref_data.sum(1)).mean() / 2 + self.stats["objective/kl"].append(gather_mean(mean_kl)) + + # Calculate entropy for model and ref data + entropy_model_data = -model_logprobs_model_data.sum(1) + entropy_ref_data = -model_logprobs_ref_data.sum(1) + mean_entropy = (entropy_model_data.mean() + entropy_ref_data.mean()) / 2 + self.stats["objective/entropy"].append(gather_mean(mean_entropy)) + + # Calculate margins + margin = chosen_rewards - rejected_rewards + self.stats["rewards/margins"].append(gather_mean(margin.mean())) + + # Calculate accuracy + accuracy = (margin > 0).float() + self.stats["rewards/accuracies"].append(gather_mean(accuracy.mean())) + + # Log EOS token statistics + model_eos = (model_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1) + ref_eos = (ref_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1) + self.stats["val/model_contain_eos_token"].append(gather_mean(model_eos.float())) + self.stats["val/ref_contain_eos_token"].append(gather_mean(ref_eos.float())) + + # Log alpha and beta + self.stats["alpha"].append(self.alpha) + self.stats["beta"].append(self.beta) + + def training_step( + self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None + ) -> torch.Tensor: + model.train() + + # Apply chat template and tokenize the input + batch_size = len(next(iter(inputs.values()))) + prompts = inputs["prompt"] + inputs = [{k: v[i] for k, v in inputs.items()} for i in range(batch_size)] + inputs = [maybe_apply_chat_template(x, self.processing_class) for x in inputs] + inputs = [self.tokenize_row(x, self.model.config.is_encoder_decoder, self.processing_class) for x in inputs] + inputs = self.data_collator(inputs) + + # need the prompt_ only + inputs = self._prepare_inputs(inputs) + context_length = inputs["prompt_input_ids"].shape[1] + prompts = { + "input_ids": inputs["prompt_input_ids"], + "attention_mask": inputs["prompt_attention_mask"], + "raw": prompts, + } + del inputs + + # Sample completions from both the model and the reference model + model_output, ref_output = self._generate_completions(prompts, model) + + # Process model completions + model_data, ref_data = self._process_completions(model_output, ref_output, prompts) + + # Compute rewards + if self.reward_funcs is not None: + model_scores, ref_scores = self._compute_rewards(model_data, ref_data, context_length) + chosen_mask = model_scores >= ref_scores + else: + model_scores, ref_scores = None, None + chosen_mask = self._compute_judge(model_data, ref_data, context_length) + + # Compute logprobs + model_logprobs_model_data, model_logprobs_ref_data, ref_logprobs_ref_data, ref_logprobs_model_data = ( + self._compute_logprobs(model, model_data, ref_data, context_length) + ) + + # Compute loss + loss, dpo_losses, xpo_losses = self._compute_losses( + model_logprobs_model_data, + model_logprobs_ref_data, + ref_logprobs_ref_data, + ref_logprobs_model_data, + chosen_mask, + ) + + # Log everything + self._log_statistics( + model_data, + ref_data, + model_logprobs_model_data.detach(), + model_logprobs_ref_data.detach(), + ref_logprobs_ref_data, + ref_logprobs_model_data, + chosen_mask, + dpo_losses.detach(), + xpo_losses.detach(), + context_length, + model_scores, + ref_scores, + ) + + if ( + self.args.torch_empty_cache_steps is not None + and self.state.global_step % self.args.torch_empty_cache_steps == 0 + ): + empty_cache() + + kwargs = {} + # For LOMO optimizers you need to 
explicitly use the learning rate
+        if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
+            kwargs["learning_rate"] = self._get_learning_rate()
+
+        if self.args.n_gpu > 1:
+            loss = loss.mean()  # mean() to average on multi-gpu parallel training
+
+        if self.use_apex:
+            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            self.accelerator.backward(loss, **kwargs)
+
+        return loss.detach() / self.args.gradient_accumulation_steps
+
+    def create_model_card(
+        self,
+        model_name: Optional[str] = None,
+        dataset_name: Optional[str] = None,
+        tags: Union[str, list[str], None] = None,
+    ):
+        """
+        Creates a draft of a model card using the information available to the `Trainer`.
+
+        Args:
+            model_name (`str` or `None`, *optional*, defaults to `None`):
+                Name of the model.
+            dataset_name (`str` or `None`, *optional*, defaults to `None`):
+                Name of the dataset used for training.
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
+                Tags to be associated with the model card.
+        """
+        if not self.is_world_process_zero():
+            return
+
+        if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path):
+            base_model = self.model.config._name_or_path
+        else:
+            base_model = None
+
+        # normalize `tags` to a mutable set
+        if tags is None:
+            tags = set()
+        elif isinstance(tags, str):
+            tags = {tags}
+        else:
+            tags = set(tags)
+
+        if hasattr(self.model.config, "unsloth_version"):
+            tags.add("unsloth")
+
+        if "JOB_ID" in os.environ:
+            tags.add("hf_jobs")
+
+        tags.update(self._tag_names)
+
+        # docstyle-ignore
+        citation = textwrap.dedent("""\
+        @article{xie2024exploratory,
+            title = {{Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF}},
+            author = {Tengyang Xie and Dylan J. Foster and Akshay Krishnamurthy and Corby Rosset and Ahmed Awadallah and Alexander Rakhlin},
+            year = 2024,
+            eprint = {arXiv:2405.21046}
+        }""")
+
+        model_card = generate_model_card(
+            base_model=base_model,
+            model_name=model_name,
+            hub_model_id=self.hub_model_id,
+            dataset_name=dataset_name,
+            tags=tags,
+            wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None,
+            comet_url=get_comet_experiment_url(),
+            trainer_name="XPO",
+            trainer_citation=citation,
+            paper_title="Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF",
+            paper_id="2405.21046",
+        )
+
+        model_card.save(os.path.join(self.args.output_dir, "README.md"))
+class UnslothXPOTrainer(_UnslothXPOTrainer):
+    """
+
+Initialize XPOTrainer as a subclass of [`OnlineDPOTrainer`].
+
+Args:
+    model (`transformers.PreTrainedModel`):
+        The model to train, preferably an `AutoModelForCausalLM`.
+    ref_model (`PreTrainedModelWrapper`):
+        Hugging Face transformer model with a causal language modeling head. Used for implicit reward computation
+        and loss. If no reference model is provided, the trainer will create a reference model with the same
+        architecture as the model to be optimized.
+    reward_funcs (`transformers.PreTrainedModel`):
+        The reward model to score completions with, preferably an `AutoModelForSequenceClassification`.
+    judge (`BasePairwiseJudge`):
+        The judge to use for pairwise comparison of model completions.
+    args (`XPOConfig`):
+        The XPO config arguments to use for training.
+    data_collator (`transformers.DataCollator`):
+        The data collator to use for training.
If None is specified, the default data collator
+        (`DPODataCollatorWithPadding`) will be used, which will pad the sequences to the maximum length of the
+        sequences in the batch, given a dataset of paired sequences.
+    train_dataset (`datasets.Dataset`):
+        The dataset to use for training.
+    eval_dataset (`datasets.Dataset`):
+        The dataset to use for evaluation.
+    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
+        Processing class used to process the data. If provided, will be used to automatically process the inputs
+        for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+        reuse the fine-tuned model.
+    peft_config (`dict`):
+        The peft config to use for training.
+    compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+        The function to use to compute the metrics. Must take an `EvalPrediction` and return a dictionary mapping
+        metric names to metric values.
+    callbacks (`list[transformers.TrainerCallback]`):
+        The callbacks to use for training.
+    optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+        The optimizer and scheduler to use for training.
+    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+        The function to use to preprocess the logits before computing the metrics.
+
+.. deprecated:: 0.22.0
+    The following parameters are deprecated and will be removed in a future version:
+
+    * `reward_model`: Use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`.
+    * `reward_processing_class`: Use `reward_processing_classes` instead. For example, change
+      `reward_processing_class=tokenizer` to `reward_processing_classes=tokenizer`.
+
+    """
+    def __init__(
+        self,
+        model = None,
+        ref_model = None,
+        reward_funcs = None,
+        judge = None,
+        args = None,
+        data_collator = None,
+        train_dataset = None,
+        eval_dataset = None,
+        processing_class = None,
+        reward_processing_classes = None,
+        peft_config = None,
+        compute_metrics = None,
+        callbacks = None,
+        preprocess_logits_for_metrics = None,
+        reward_model = None,
+        **kwargs
+    ):
+        if args is None: args = UnslothXPOConfig()
+        use_bf16 = getattr(args, 'bf16', False)
+        if type(use_bf16) is not bool: use_bf16 = False
+        use_fp16 = getattr(args, 'fp16', False)
+        if type(use_fp16) is not bool: use_fp16 = False
+        force_float32 = False
+        full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1'
+        if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'):
+            print('Unsloth: Switching to float32 training since model cannot work with float16')
+            force_float32 = True
+        mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32')
+        dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None)
+        if dtype is None: dtype = model.get_input_embeddings().dtype
+        from unsloth_zoo.utils import _get_dtype
+        dtype = _get_dtype(dtype)
+        float16 = dtype == torch.float16
+        if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`')
+        if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision.
Set fp16 to `False` and bf16 to `True`') + if force_float32: + # Forced float32 training + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + # Mixed precision training + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if 
isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq( + __tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq( + __tokenizer.tokenizer, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + else: + data_collator = TransformersDataCollatorForLanguageModeling( + __tokenizer.tokenizer, + mlm = False, + mlm_probability = 0.0, + pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), + ) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('xpo_trainer', other_metrics) + + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? [TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() + super().__init__( + model = model, + ref_model = ref_model, + reward_funcs = reward_funcs, + judge = judge, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + reward_processing_classes = reward_processing_classes, + peft_config = peft_config, + compute_metrics = compute_metrics, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + reward_model = reward_model,**kwargs) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + if hasattr(self, 'accelerator'): + scaler = self.accelerator.scaler + current_model = model + while hasattr(current_model, 'model'): + current_model.accelerator_scaler = scaler + current_model = current_model.model + current_model.accelerator_scaler = scaler + pass + if hasattr(self, 'train'): + self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) + pass + +pass
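+
+# Minimal usage sketch for the wrappers above. The model id, dataset, and judge
+# are placeholders borrowed from TRL's XPO example, not part of this generated
+# file; adapt them to your setup. PairRMJudge additionally requires the
+# `llm-blender` package. Guarded so it never runs on import.
+if __name__ == '__main__':
+    from datasets import load_dataset
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    from trl import PairRMJudge
+
+    demo_model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2-0.5B-Instruct')
+    demo_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2-0.5B-Instruct')
+    demo_judge = PairRMJudge()  # pairwise judge used in place of a reward model
+    demo_dataset = load_dataset('trl-lib/ultrafeedback-prompt', split = 'train')
+
+    demo_args = UnslothXPOConfig(output_dir = 'xpo-demo', logging_steps = 10)
+    trainer = UnslothXPOTrainer(
+        model = demo_model,
+        judge = demo_judge,
+        args = demo_args,
+        processing_class = demo_tokenizer,
+        train_dataset = demo_dataset,
+    )
+    trainer.train()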